mirror of
https://github.com/signalwire/freeswitch.git
synced 2026-07-04 19:31:56 +00:00
update to pcre 7.9
git-svn-id: http://svn.freeswitch.org/svn/freeswitch/trunk@13706 d0543943-73ff-0310-b7d9-9358b9ac24b2
This commit is contained in:
Executable
+296
@@ -0,0 +1,296 @@
|
||||
#! /usr/bin/perl -w
|
||||
|
||||
# Script to turn PCRE man pages into HTML
|
||||
|
||||
|
||||
# Subroutine to handle font changes and other escapes
|
||||
|
||||
# do_line(TEXT) - convert one line of man-page text to HTML.
#
# Escapes < and > as numeric character references, turns \fI...\fR and
# \fB...\fR (or \fP) font changes into <i> and <b> elements, turns the
# troff escape \e into a literal backslash, and replaces "(c)" following
# "Copyright " with the &copy; entity. Returns the converted string; the
# caller's argument is copied, not modified.
sub do_line {
my($s) = $_[0];

# Escape < and > first, so that the tags inserted below are left intact.
$s =~ s/</&#60;/g;
$s =~ s/>/&#62;/g;

# Font changes: italic and bold runs end at the next \fR or \fP.
$s =~ s"\\fI(.*?)\\f[RP]"<i>$1</i>"g;
$s =~ s"\\fB(.*?)\\f[RP]"<b>$1</b>"g;

# \e is the troff way of writing a backslash.
$s =~ s"\\e"\\"g;

# Use an entity rather than a literal character for the copyright sign.
$s =~ s/(?<=Copyright )\(c\)/&copy;/g;
$s;
}
|
||||
|
||||
# Subroutine to ensure not in a paragraph
|
||||
|
||||
# Close the paragraph currently open on TEMP, if any (together with a
# literal section still open inside it), then clear the paragraph,
# literal-section and text-written flags.
sub end_para {
    if ($inpara) {
        my $closing = $inpre ? "</PRE>\n</P>\n" : "</P>\n";
        print TEMP $closing;
    }
    $inpre     = 0;
    $inpara    = 0;
    $wrotetext = 0;
}
|
||||
|
||||
# Subroutine to start a new paragraph
|
||||
|
||||
# Begin a fresh paragraph on TEMP: close whatever paragraph is open, write
# the opening tag, and record that a paragraph is now in progress.
sub new_para {
    end_para();
    print TEMP "<P>\n";
    return $inpara = 1;
}
|
||||
|
||||
|
||||
# Main program
|
||||
|
||||
# Read a man page on STDIN and write its HTML version to STDOUT. The page
# body is written through the TEMP handle (which end_para and new_para also
# use) into an in-memory buffer, so that when -toc is given the table of
# contents can go to STDOUT ahead of the body.

$innf = 0;        # set by .nf, cleared by .fi
$inpara = 0;      # true while a <P> is open
$inpre = 0;       # true while a <pre> is open
$wrotetext = 0;   # true once text has been output since the last end_para
$toc = 0;         # true if -toc was given
$ref = 1;         # number used for the next TOC/SEC anchor pair

# Leading arguments that start with "-" are options; only -toc is acted on.

while (@ARGV && $ARGV[0] =~ /^-/)
  {
  $toc = 1 if $ARGV[0] eq "-toc";
  shift(@ARGV);
  }

# Initial output to STDOUT

print <<End ;
<html>
<head>
<title>$ARGV[0] specification</title>
</head>
<body bgcolor="#FFFFFF" text="#00005A" link="#0066FF" alink="#3399FF" vlink="#2222BB">
<h1>$ARGV[0] man page</h1>
<p>
Return to the <a href="index.html">PCRE index page</a>.
</p>
<p>
This page is part of the PCRE HTML documentation. It was generated automatically
from the original man page. If there is any nonsense in it, please consult the
man page, in case the conversion went wrong.
<br>
End

print "<ul>\n" if ($toc);

# Collect the body in memory rather than in a predictably named file in /tmp.

my $body = "";
open(TEMP, ">", \$body) || die "Can't open in-memory buffer for output\n";

while (<STDIN>)
  {
  # Handle lines beginning with a dot

  if (/^\./)
    {
    # Some of the PCRE man pages used to contain instances of .br. However,
    # they should have all been removed because they cause trouble in some
    # (other) automated systems that translate man pages to HTML. Complain if
    # we find .br or .in (another macro that is deprecated).

    if (/^\.br/ || /^\.in/)
      {
      print STDERR "\n*** Deprecated macro encountered - rewrite needed\n";
      print STDERR "*** $_\n";
      die "*** Processing abandoned\n";
      }

    # Instead of .br, relevant "literal" sections are enclosed in .nf/.fi.

    elsif (/^\.nf/)
      {
      $innf = 1;
      }

    elsif (/^\.fi/)
      {
      $innf = 0;
      }

    # Handling .sp is subtle. If it is inside a literal section, do nothing if
    # the next line is a non literal text line; similarly, if not inside a
    # literal section, do nothing if a literal follows. The point being that
    # the <pre> and </pre> that delimit literal sections will do the spacing.
    # Always skip if no previous output.

    elsif (/^\.sp/)
      {
      if ($wrotetext)
        {
        $_ = <STDIN>;
        $_ = "" unless defined($_);   # .sp was the last line of input
        if ($inpre)
          {
          print TEMP "\n" if (/^[\s.]/);
          }
        else
          {
          print TEMP "<br>\n<br>\n" if (!/^[\s.]/);
          }
        redo;    # Now process the lookahead line we just read
        }
      }
    elsif (/^\.TP/ || /^\.PP/ || /^\.P/)
      {
      new_para();
      }
    elsif (/^\.SH\s*("?)(.*)\1/)
      {
      my($heading) = $2;

      # Ignore the NAME section, including the line that follows the heading

      if ($heading =~ /^NAME\b/)
        {
        scalar(<STDIN>);
        next;
        }

      end_para();
      my($title) = do_line($heading);
      if ($toc)
        {
        # The title is passed as data, not as part of the format, so that a
        # "%" in a heading cannot be taken as a conversion.
        printf("<li><a name=\"TOC%d\" href=\"#SEC%d\">%s</a>\n",
          $ref, $ref, $title);
        printf TEMP ("<br><a name=\"SEC%d\" href=\"#TOC1\">%s</a><br>\n",
          $ref, $title);
        $ref++;
        }
      else
        {
        print TEMP "<br><b>\n$title\n</b><br>\n";
        }
      }
    elsif (/^\.SS\s*("?)(.*)\1/)
      {
      end_para();
      my($title) = do_line($2);
      print TEMP "<br><b>\n$title\n</b><br>\n";
      }
    elsif (/^\.B\s*(.*)/)
      {
      new_para() if (!$inpara);
      $_ = do_line($1);
      s/"(.*?)"/$1/g;
      print TEMP "<b>$_</b>\n";
      $wrotetext = 1;
      }
    elsif (/^\.I\s*(.*)/)
      {
      new_para() if (!$inpara);
      $_ = do_line($1);
      s/"(.*?)"/$1/g;
      print TEMP "<i>$_</i>\n";
      $wrotetext = 1;
      }

    # A comment that starts "HREF" takes the next line as a name that
    # is turned into a hyperlink, using the text given, which might be
    # in a special font. If it ends in () or (digits) or punctuation, they
    # aren't part of the link.

    elsif (/^\.\\"\s*HREF/)
      {
      $_ = <STDIN>;
      $_ = "" unless defined($_);
      chomp;
      $_ = do_line($_);
      $_ =~ s/\s+$//;
      if (/^(?:<.>)?([^<(]+)(?:\(\))?(?:<\/.>)?(?:\(\d+\))?[.,;:]?$/)
        {
        print TEMP "<a href=\"$1.html\">$_</a>\n";
        }
      else
        {
        # No usable link target: output the text without a link rather than
        # a link built from an undefined name.
        print STDERR "*** HREF target not recognized: $_\n";
        print TEMP "$_\n";
        }
      }

    # A comment that starts "HTML" inserts literal HTML

    elsif (/^\.\\"\s*HTML\s*(.*)/)
      {
      print TEMP $1;
      }

    # A comment that starts < inserts that HTML at the end of the
    # *next* input line - so as not to get a newline between them.

    elsif (/^\.\\"\s*(<.*>)/)
      {
      my($markup) = $1;
      $_ = <STDIN>;
      $_ = "" unless defined($_);
      chomp;
      $_ = do_line($_);
      $_ =~ s/\s+$//;
      print TEMP "$_$markup\n";
      }

    # A comment that starts JOIN joins the next two lines together, with one
    # space between them. Then that line is processed. This is used in some
    # displays where two lines are needed for the "man" version. JOINSH works
    # the same, except that it assumes this is a shell command, so removes
    # continuation backslashes.

    elsif (/^\.\\"\s*JOIN(SH)?/)
      {
      my($is_shell) = defined($1);
      my($one,$two);
      $one = <STDIN>;
      $two = <STDIN>;
      $one = "" unless defined($one);   # input ended before both lines
      $two = "" unless defined($two);
      $one =~ s/\s*\\e\s*$// if ($is_shell);
      chomp($one);
      $two =~ s/^\s+//;
      $_ = "$one $two";
      redo;    # Process the joined lines
      }

    # Ignore anything not recognized

    next;
    }

  # Line does not begin with a dot. Replace blank lines with new paragraphs

  if (/^\s*$/)
    {
    end_para() if ($wrotetext);
    next;
    }

  # Convert fonts changes and output an ordinary line. Ensure that indented
  # lines are marked as literal.

  $_ = do_line($_);
  new_para() if (!$inpara);

  if (/^\s/)
    {
    if (!$inpre)
      {
      print TEMP "<pre>\n";
      $inpre = 1;
      }
    }
  elsif ($inpre)
    {
    print TEMP "</pre>\n";
    $inpre = 0;
    }

  # Add <br> to the end of a non-literal line if we are within .nf/.fi

  $_ .= "<br>\n" if (!$inpre && $innf);

  print TEMP;
  $wrotetext = 1;
  }

# The TOC, if present, will have been written - terminate it

print "</ul>\n" if ($toc);

# Copy the collected body to the standard output

close(TEMP);
print $body;

print <<End ;
<p>
Return to the <a href="index.html">PCRE index page</a>.
</p>
End
|
||||
|
||||
# End
|
||||
+3
-3
@@ -6,9 +6,9 @@ Email local part: ph10
|
||||
Email domain: cam.ac.uk
|
||||
|
||||
University of Cambridge Computing Service,
|
||||
Cambridge, England. Phone: +44 1223 334714.
|
||||
Cambridge, England.
|
||||
|
||||
Copyright (c) 1997-2006 University of Cambridge
|
||||
Copyright (c) 1997-2009 University of Cambridge
|
||||
All rights reserved
|
||||
|
||||
|
||||
@@ -17,7 +17,7 @@ THE C++ WRAPPER LIBRARY
|
||||
|
||||
Written by: Google Inc.
|
||||
|
||||
Copyright (c) 2006 Google Inc
|
||||
Copyright (c) 2007-2008 Google Inc
|
||||
All rights reserved
|
||||
|
||||
####
|
||||
|
||||
+565
-4
@@ -1,17 +1,578 @@
|
||||
cmake_minimum_required(VERSION 2.6)
|
||||
# CMakeLists.txt
|
||||
#
|
||||
#
|
||||
# This file allows building PCRE with the CMake configuration and build
|
||||
# tool. Download CMake in source or binary form from http://www.cmake.org/
|
||||
#
|
||||
# Original listfile by Christian Ehrlicher <Ch.Ehrlicher@gmx.de>
|
||||
# Refined and expanded by Daniel Richard G. <skunk@iSKUNK.ORG>
|
||||
# 2007-09-14 mod by Sheri so 7.4 supported configuration options can be entered
|
||||
# 2007-09-19 Adjusted by PH to retain previous default settings
|
||||
# 2007-12-26 (a) On UNIX, use names libpcre instead of just pcre
|
||||
# (b) Ensure pcretest and pcregrep link with the local library,
|
||||
# not a previously-installed one.
|
||||
# (c) Add PCRE_SUPPORT_LIBREADLINE, PCRE_SUPPORT_LIBZ, and
|
||||
# PCRE_SUPPORT_LIBBZ2.
|
||||
# 2008-01-20 Brought up to date to include several new features by Christian
|
||||
# Ehrlicher.
|
||||
# 2008-01-22 Sheri added options for backward compatibility of library names
|
||||
# when building with minGW:
|
||||
# if "ON", NON_STANDARD_LIB_PREFIX causes shared libraries to
|
||||
# be built without "lib" as prefix. (The libraries will be named
|
||||
# pcre.dll, pcreposix.dll and pcrecpp.dll).
|
||||
# if "ON", NON_STANDARD_LIB_SUFFIX causes shared libraries to
|
||||
# be built with suffix of "-0.dll". (The libraries will be named
|
||||
# libpcre-0.dll, libpcreposix-0.dll and libpcrecpp-0.dll - same names
|
||||
# built by default with Configure and Make).
|
||||
# 2008-01-23 PH removed the automatic build of pcredemo.
|
||||
# 2008-04-22 PH modified READLINE support so it finds NCURSES when needed.
|
||||
# 2008-07-03 PH updated for revised UCP property support (change of files)
|
||||
# 2009-03-23 PH applied Steven Van Ingelgem's patch to change the name
|
||||
# CMAKE_BINARY_DIR to PROJECT_BINARY_DIR so that it works when PCRE
|
||||
# is included within another project.
|
||||
# 2009-03-23 PH applied a modified version of Steven Van Ingelgem's patches to
|
||||
# add options to stop the building of pcregrep and the tests, and
|
||||
# to disable the final configuration report.
|
||||
# 2009-04-11 PH applied Christian Ehrlicher's patch to show compiler flags that
|
||||
# are set by specifying a release type.
|
||||
|
||||
INCLUDE_DIRECTORIES(${CMAKE_SOURCE_DIR}/libs/pcre/include/ ${CMAKE_SOURCE_DIR}/libs/pcre/)
|
||||
PROJECT(PCRE C CXX)
|
||||
|
||||
SET ( pcre_SRCS pcre_compile.c pcre_tables.c pcre_config.c pcre_try_flipped.c pcre_dfa_exec.c pcre_ucp_searchfuncs.c pcre_exec.c pcre_valid_utf8.c pcre_fullinfo.c pcre_version.c dftables.c pcre_get.c pcre_xclass.c pcre_globals.c pcre_info.c pcrecpp.h pcre_internal.h pcre_maketables.c pcrecpparg.h pcre_ord2utf8.c pcredemo.c pcre_refcount.c pcregrep.c pcreposix.c pcre_scanner.h pcreposix.h pcre_scanner_unittest.cc pcretest.c pcre_stringpiece.h pcre_stringpiece.h.in ucp.h pcre.h ucpinternal.h pcre_chartables.c pcre.h )
|
||||
CMAKE_MINIMUM_REQUIRED(VERSION 2.4.6)
|
||||
|
||||
SET(CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake) # for FindReadline.cmake
|
||||
|
||||
# external packages
|
||||
FIND_PACKAGE( BZip2 )
|
||||
FIND_PACKAGE( ZLIB )
|
||||
FIND_PACKAGE( Readline )
|
||||
|
||||
# Configuration checks
|
||||
|
||||
INCLUDE(CheckIncludeFile)
|
||||
INCLUDE(CheckIncludeFileCXX)
|
||||
INCLUDE(CheckFunctionExists)
|
||||
INCLUDE(CheckTypeSize)
|
||||
|
||||
CHECK_INCLUDE_FILE(dirent.h HAVE_DIRENT_H)
|
||||
CHECK_INCLUDE_FILE(sys/stat.h HAVE_SYS_STAT_H)
|
||||
CHECK_INCLUDE_FILE(sys/types.h HAVE_SYS_TYPES_H)
|
||||
CHECK_INCLUDE_FILE(unistd.h HAVE_UNISTD_H)
|
||||
CHECK_INCLUDE_FILE(windows.h HAVE_WINDOWS_H)
|
||||
|
||||
CHECK_INCLUDE_FILE_CXX(type_traits.h HAVE_TYPE_TRAITS_H)
|
||||
CHECK_INCLUDE_FILE_CXX(bits/type_traits.h HAVE_BITS_TYPE_TRAITS_H)
|
||||
|
||||
CHECK_FUNCTION_EXISTS(bcopy HAVE_BCOPY)
|
||||
CHECK_FUNCTION_EXISTS(memmove HAVE_MEMMOVE)
|
||||
CHECK_FUNCTION_EXISTS(strerror HAVE_STRERROR)
|
||||
CHECK_FUNCTION_EXISTS(strtoll HAVE_STRTOLL)
|
||||
CHECK_FUNCTION_EXISTS(strtoq HAVE_STRTOQ)
|
||||
CHECK_FUNCTION_EXISTS(_strtoi64 HAVE__STRTOI64)
|
||||
|
||||
CHECK_TYPE_SIZE("long long" LONG_LONG)
|
||||
CHECK_TYPE_SIZE("unsigned long long" UNSIGNED_LONG_LONG)
|
||||
|
||||
# User-configurable options
|
||||
#
|
||||
# (Note: CMakeSetup displays these in alphabetical order, regardless of
|
||||
# the order we use here)
|
||||
|
||||
SET(BUILD_SHARED_LIBS OFF CACHE BOOL
|
||||
"Build shared libraries instead of static ones.")
|
||||
|
||||
OPTION(PCRE_BUILD_PCRECPP "Build the PCRE C++ library (pcrecpp)." ON)
|
||||
|
||||
SET(PCRE_EBCDIC OFF CACHE BOOL
|
||||
"Use EBCDIC coding instead of ASCII. (This is rarely used outside of mainframe systems)")
|
||||
|
||||
SET(PCRE_LINK_SIZE "2" CACHE STRING
|
||||
"Internal link size (2, 3 or 4 allowed). See LINK_SIZE in config.h.in for details.")
|
||||
|
||||
SET(PCRE_MATCH_LIMIT "10000000" CACHE STRING
|
||||
"Default limit on internal looping. See MATCH_LIMIT in config.h.in for details.")
|
||||
|
||||
SET(PCRE_MATCH_LIMIT_RECURSION "MATCH_LIMIT" CACHE STRING
|
||||
"Default limit on internal recursion. See MATCH_LIMIT_RECURSION in config.h.in for details.")
|
||||
|
||||
SET(PCRE_NEWLINE "LF" CACHE STRING
|
||||
"What to recognize as a newline (one of CR, LF, CRLF, ANY, ANYCRLF).")
|
||||
|
||||
SET(PCRE_NO_RECURSE OFF CACHE BOOL
|
||||
"If ON, then don't use stack recursion when matching. See NO_RECURSE in config.h.in for details.")
|
||||
|
||||
SET(PCRE_POSIX_MALLOC_THRESHOLD "10" CACHE STRING
|
||||
"Threshold for malloc() usage. See POSIX_MALLOC_THRESHOLD in config.h.in for details.")
|
||||
|
||||
SET(PCRE_SUPPORT_UNICODE_PROPERTIES OFF CACHE BOOL
|
||||
"Enable support for Unicode properties. (If set, UTF-8 support will be enabled as well)")
|
||||
|
||||
SET(PCRE_SUPPORT_UTF8 OFF CACHE BOOL
|
||||
"Enable support for the Unicode UTF-8 encoding.")
|
||||
|
||||
SET(PCRE_SUPPORT_BSR_ANYCRLF OFF CACHE BOOL
|
||||
"ON=Backslash-R matches only LF CR and CRLF, OFF=Backslash-R matches all Unicode Linebreaks")
|
||||
|
||||
OPTION(PCRE_SHOW_REPORT "Show the final configuration report" ON)
|
||||
OPTION(PCRE_BUILD_PCREGREP "Build pcregrep" ON)
|
||||
OPTION(PCRE_BUILD_TESTS "Build the tests" ON)
|
||||
|
||||
IF (PCRE_BUILD_TESTS)
|
||||
IF (NOT PCRE_BUILD_PCREGREP)
|
||||
MESSAGE(STATUS "** Building tests requires pcregrep: PCRE_BUILD_PCREGREP forced ON")
|
||||
SET(PCRE_BUILD_PCREGREP ON)
|
||||
ENDIF(NOT PCRE_BUILD_PCREGREP)
|
||||
ENDIF(PCRE_BUILD_TESTS)
|
||||
|
||||
IF (MINGW)
|
||||
OPTION(NON_STANDARD_LIB_PREFIX
|
||||
"ON=Shared libraries built in mingw will be named pcre.dll, etc., instead of libpcre.dll, etc."
|
||||
OFF)
|
||||
|
||||
OPTION(NON_STANDARD_LIB_SUFFIX
|
||||
"ON=Shared libraries built in mingw will be named libpcre-0.dll, etc., instead of libpcre.dll, etc."
|
||||
OFF)
|
||||
ENDIF(MINGW)
|
||||
|
||||
# bzip2 lib
|
||||
IF(BZIP2_FOUND)
|
||||
OPTION (PCRE_SUPPORT_LIBBZ2 "Enable support for linking pcregrep with libbz2." ON)
|
||||
ENDIF(BZIP2_FOUND)
|
||||
IF(PCRE_SUPPORT_LIBBZ2)
|
||||
INCLUDE_DIRECTORIES(${BZIP2_INCLUDE_DIR})
|
||||
ENDIF(PCRE_SUPPORT_LIBBZ2)
|
||||
|
||||
# zlib
|
||||
IF(ZLIB_FOUND)
|
||||
OPTION (PCRE_SUPPORT_LIBZ "Enable support for linking pcregrep with libz." ON)
|
||||
ENDIF(ZLIB_FOUND)
|
||||
IF(PCRE_SUPPORT_LIBZ)
|
||||
INCLUDE_DIRECTORIES(${ZLIB_INCLUDE_DIR})
|
||||
ENDIF(PCRE_SUPPORT_LIBZ)
|
||||
|
||||
# readline lib
|
||||
IF(READLINE_FOUND)
|
||||
OPTION (PCRE_SUPPORT_LIBREADLINE "Enable support for linking pcretest with libreadline." ON)
|
||||
ENDIF(READLINE_FOUND)
|
||||
IF(PCRE_SUPPORT_LIBREADLINE)
|
||||
INCLUDE_DIRECTORIES(${READLINE_INCLUDE_DIR})
|
||||
ENDIF(PCRE_SUPPORT_LIBREADLINE)
|
||||
|
||||
# Prepare build configuration
|
||||
|
||||
SET(pcre_have_type_traits 0)
|
||||
SET(pcre_have_bits_type_traits 0)
|
||||
|
||||
IF(HAVE_TYPE_TRAITS_H)
|
||||
SET(pcre_have_type_traits 1)
|
||||
ENDIF(HAVE_TYPE_TRAITS_H)
|
||||
|
||||
IF(HAVE_BITS_TYPE_TRAITS_H)
|
||||
SET(pcre_have_bits_type_traits 1)
|
||||
ENDIF(HAVE_BITS_TYPE_TRAITS_H)
|
||||
|
||||
SET(pcre_have_long_long 0)
|
||||
SET(pcre_have_ulong_long 0)
|
||||
|
||||
IF(HAVE_LONG_LONG)
|
||||
SET(pcre_have_long_long 1)
|
||||
ENDIF(HAVE_LONG_LONG)
|
||||
|
||||
IF(HAVE_UNSIGNED_LONG_LONG)
|
||||
SET(pcre_have_ulong_long 1)
|
||||
ENDIF(HAVE_UNSIGNED_LONG_LONG)
|
||||
|
||||
IF(NOT BUILD_SHARED_LIBS)
|
||||
SET(PCRE_STATIC 1)
|
||||
ENDIF(NOT BUILD_SHARED_LIBS)
|
||||
|
||||
IF(PCRE_SUPPORT_BSR_ANYCRLF)
|
||||
SET(BSR_ANYCRLF 1)
|
||||
ENDIF(PCRE_SUPPORT_BSR_ANYCRLF)
|
||||
|
||||
IF(PCRE_SUPPORT_UTF8 OR PCRE_SUPPORT_UNICODE_PROPERTIES)
|
||||
SET(SUPPORT_UTF8 1)
|
||||
ENDIF(PCRE_SUPPORT_UTF8 OR PCRE_SUPPORT_UNICODE_PROPERTIES)
|
||||
|
||||
IF(PCRE_SUPPORT_UNICODE_PROPERTIES)
|
||||
SET(SUPPORT_UCP 1)
|
||||
ENDIF(PCRE_SUPPORT_UNICODE_PROPERTIES)
|
||||
|
||||
# This next one used to contain
|
||||
# SET(PCRETEST_LIBS ${READLINE_LIBRARY})
|
||||
# but I was advised to add the NCURSES test as well, along with
|
||||
# some modifications to cmake/FindReadline.cmake which should
|
||||
# make it possible to override the default if necessary. PH
|
||||
|
||||
IF(PCRE_SUPPORT_LIBREADLINE)
|
||||
SET(SUPPORT_LIBREADLINE 1)
|
||||
SET(PCRETEST_LIBS ${READLINE_LIBRARY} ${NCURSES_LIBRARY})
|
||||
ENDIF(PCRE_SUPPORT_LIBREADLINE)
|
||||
|
||||
IF(PCRE_SUPPORT_LIBZ)
|
||||
SET(SUPPORT_LIBZ 1)
|
||||
SET(PCREGREP_LIBS ${PCREGREP_LIBS} ${ZLIB_LIBRARIES})
|
||||
ENDIF(PCRE_SUPPORT_LIBZ)
|
||||
|
||||
IF(PCRE_SUPPORT_LIBBZ2)
|
||||
SET(SUPPORT_LIBBZ2 1)
|
||||
SET(PCREGREP_LIBS ${PCREGREP_LIBS} ${BZIP2_LIBRARIES})
|
||||
ENDIF(PCRE_SUPPORT_LIBBZ2)
|
||||
|
||||
SET(NEWLINE "")
|
||||
|
||||
IF(PCRE_NEWLINE STREQUAL "LF")
|
||||
SET(NEWLINE "10")
|
||||
ENDIF(PCRE_NEWLINE STREQUAL "LF")
|
||||
IF(PCRE_NEWLINE STREQUAL "CR")
|
||||
SET(NEWLINE "13")
|
||||
ENDIF(PCRE_NEWLINE STREQUAL "CR")
|
||||
IF(PCRE_NEWLINE STREQUAL "CRLF")
|
||||
SET(NEWLINE "3338")
|
||||
ENDIF(PCRE_NEWLINE STREQUAL "CRLF")
|
||||
IF(PCRE_NEWLINE STREQUAL "ANY")
|
||||
SET(NEWLINE "-1")
|
||||
ENDIF(PCRE_NEWLINE STREQUAL "ANY")
|
||||
IF(PCRE_NEWLINE STREQUAL "ANYCRLF")
|
||||
SET(NEWLINE "-2")
|
||||
ENDIF(PCRE_NEWLINE STREQUAL "ANYCRLF")
|
||||
|
||||
IF(NEWLINE STREQUAL "")
|
||||
MESSAGE(FATAL_ERROR "The PCRE_NEWLINE variable must be set to one of the following values: \"LF\", \"CR\", \"CRLF\", \"ANY\", \"ANYCRLF\".")
|
||||
ENDIF(NEWLINE STREQUAL "")
|
||||
|
||||
IF(PCRE_EBCDIC)
|
||||
SET(EBCDIC 1)
|
||||
ENDIF(PCRE_EBCDIC)
|
||||
|
||||
IF(PCRE_NO_RECURSE)
|
||||
SET(NO_RECURSE 1)
|
||||
ENDIF(PCRE_NO_RECURSE)
|
||||
|
||||
# Output files
|
||||
CONFIGURE_FILE(config-cmake.h.in
|
||||
${PROJECT_BINARY_DIR}/config.h
|
||||
@ONLY)
|
||||
|
||||
CONFIGURE_FILE(pcre.h.generic
|
||||
${PROJECT_BINARY_DIR}/pcre.h
|
||||
COPYONLY)
|
||||
|
||||
# What about pcre-config and libpcre.pc?
|
||||
|
||||
IF(PCRE_BUILD_PCRECPP)
|
||||
CONFIGURE_FILE(pcre_stringpiece.h.in
|
||||
${PROJECT_BINARY_DIR}/pcre_stringpiece.h
|
||||
@ONLY)
|
||||
|
||||
CONFIGURE_FILE(pcrecpparg.h.in
|
||||
${PROJECT_BINARY_DIR}/pcrecpparg.h
|
||||
@ONLY)
|
||||
ENDIF(PCRE_BUILD_PCRECPP)
|
||||
|
||||
# Character table generation
|
||||
|
||||
OPTION(PCRE_REBUILD_CHARTABLES "Rebuild char tables" OFF)
|
||||
IF(PCRE_REBUILD_CHARTABLES)
|
||||
ADD_EXECUTABLE(dftables dftables.c)
|
||||
|
||||
GET_TARGET_PROPERTY(DFTABLES_EXE dftables LOCATION)
|
||||
|
||||
ADD_CUSTOM_COMMAND(
|
||||
COMMENT "Generating character tables (pcre_chartables.c) for current locale"
|
||||
DEPENDS dftables
|
||||
COMMAND ${DFTABLES_EXE}
|
||||
ARGS ${PROJECT_BINARY_DIR}/pcre_chartables.c
|
||||
OUTPUT ${PROJECT_BINARY_DIR}/pcre_chartables.c
|
||||
)
|
||||
ELSE(PCRE_REBUILD_CHARTABLES)
|
||||
CONFIGURE_FILE(${PROJECT_SOURCE_DIR}/pcre_chartables.c.dist
|
||||
${PROJECT_BINARY_DIR}/pcre_chartables.c
|
||||
COPYONLY)
|
||||
ENDIF(PCRE_REBUILD_CHARTABLES)
|
||||
|
||||
# Source code
|
||||
|
||||
SET(PCRE_HEADERS ${PROJECT_BINARY_DIR}/pcre.h)
|
||||
|
||||
SET(PCRE_SOURCES
|
||||
${PROJECT_BINARY_DIR}/pcre_chartables.c
|
||||
pcre_compile.c
|
||||
pcre_config.c
|
||||
pcre_dfa_exec.c
|
||||
pcre_exec.c
|
||||
pcre_fullinfo.c
|
||||
pcre_get.c
|
||||
pcre_globals.c
|
||||
pcre_info.c
|
||||
pcre_newline.c
|
||||
pcre_maketables.c
|
||||
pcre_ord2utf8.c
|
||||
pcre_refcount.c
|
||||
pcre_study.c
|
||||
pcre_tables.c
|
||||
pcre_try_flipped.c
|
||||
pcre_ucd.c
|
||||
pcre_valid_utf8.c
|
||||
pcre_version.c
|
||||
pcre_xclass.c
|
||||
)
|
||||
|
||||
SET(PCREPOSIX_HEADERS pcreposix.h)
|
||||
|
||||
SET(PCREPOSIX_SOURCES pcreposix.c)
|
||||
|
||||
SET(PCRECPP_HEADERS
|
||||
pcrecpp.h
|
||||
pcre_scanner.h
|
||||
${PROJECT_BINARY_DIR}/pcrecpparg.h
|
||||
${PROJECT_BINARY_DIR}/pcre_stringpiece.h
|
||||
)
|
||||
|
||||
SET(PCRECPP_SOURCES
|
||||
pcrecpp.cc
|
||||
pcre_scanner.cc
|
||||
pcre_stringpiece.cc
|
||||
)
|
||||
|
||||
# Build setup
|
||||
|
||||
ADD_DEFINITIONS(-DHAVE_CONFIG_H)
|
||||
|
||||
IF(MSVC)
|
||||
ADD_DEFINITIONS(-D_CRT_SECURE_NO_DEPRECATE)
|
||||
ENDIF(MSVC)
|
||||
|
||||
SET(CMAKE_INCLUDE_CURRENT_DIR 1)
|
||||
# needed to make sure to not link debug libs
|
||||
# against release libs and vice versa
|
||||
IF(WIN32)
|
||||
SET(CMAKE_DEBUG_POSTFIX "d")
|
||||
ENDIF(WIN32)
|
||||
|
||||
SET(targets)
|
||||
|
||||
# Libraries
|
||||
# pcre
|
||||
ADD_LIBRARY(pcre ${PCRE_HEADERS} ${PCRE_SOURCES} ${PROJECT_BINARY_DIR}/config.h)
|
||||
SET(targets ${targets} pcre)
|
||||
ADD_LIBRARY(pcreposix ${PCREPOSIX_HEADERS} ${PCREPOSIX_SOURCES})
|
||||
SET(targets ${targets} pcreposix)
|
||||
TARGET_LINK_LIBRARIES(pcreposix pcre)
|
||||
IF(MINGW AND NOT PCRE_STATIC)
|
||||
IF(NON_STANDARD_LIB_PREFIX)
|
||||
SET_TARGET_PROPERTIES(pcre pcreposix
|
||||
PROPERTIES PREFIX ""
|
||||
)
|
||||
ENDIF(NON_STANDARD_LIB_PREFIX)
|
||||
|
||||
IF(NON_STANDARD_LIB_SUFFIX)
|
||||
SET_TARGET_PROPERTIES(pcre pcreposix
|
||||
PROPERTIES SUFFIX "-0.dll"
|
||||
)
|
||||
ENDIF(NON_STANDARD_LIB_SUFFIX)
|
||||
ENDIF(MINGW AND NOT PCRE_STATIC)
|
||||
|
||||
|
||||
# pcrecpp
|
||||
IF(PCRE_BUILD_PCRECPP)
|
||||
ADD_LIBRARY(pcrecpp ${PCRECPP_HEADERS} ${PCRECPP_SOURCES})
|
||||
SET(targets ${targets} pcrecpp)
|
||||
TARGET_LINK_LIBRARIES(pcrecpp pcre)
|
||||
|
||||
IF(MINGW AND NOT PCRE_STATIC)
|
||||
IF(NON_STANDARD_LIB_PREFIX)
|
||||
SET_TARGET_PROPERTIES(pcrecpp
|
||||
PROPERTIES PREFIX ""
|
||||
)
|
||||
ENDIF(NON_STANDARD_LIB_PREFIX)
|
||||
|
||||
IF(NON_STANDARD_LIB_SUFFIX)
|
||||
SET_TARGET_PROPERTIES(pcrecpp
|
||||
PROPERTIES SUFFIX "-0.dll"
|
||||
)
|
||||
ENDIF(NON_STANDARD_LIB_SUFFIX)
|
||||
ENDIF(MINGW AND NOT PCRE_STATIC)
|
||||
ENDIF(PCRE_BUILD_PCRECPP)
|
||||
|
||||
|
||||
ADD_LIBRARY(pcre STATIC ${pcre_SRCS})
|
||||
# Executables
|
||||
|
||||
# Removed by PH (2008-01-23) because pcredemo shouldn't really be built
|
||||
# automatically, and it gave trouble in some environments anyway.
|
||||
# ADD_EXECUTABLE(pcredemo pcredemo.c)
|
||||
# TARGET_LINK_LIBRARIES(pcredemo pcreposix)
|
||||
# IF(NOT BUILD_SHARED_LIBS)
|
||||
# # make sure to not use declspec(dllimport) in static mode on windows
|
||||
# SET_TARGET_PROPERTIES(pcredemo PROPERTIES COMPILE_FLAGS "-DPCRE_STATIC")
|
||||
# ENDIF(NOT BUILD_SHARED_LIBS)
|
||||
|
||||
IF(PCRE_BUILD_PCREGREP)
|
||||
ADD_EXECUTABLE(pcregrep pcregrep.c)
|
||||
SET(targets ${targets} pcregrep)
|
||||
TARGET_LINK_LIBRARIES(pcregrep pcreposix ${PCREGREP_LIBS})
|
||||
ENDIF(PCRE_BUILD_PCREGREP)
|
||||
|
||||
|
||||
# Testing
|
||||
IF(PCRE_BUILD_TESTS)
|
||||
ENABLE_TESTING()
|
||||
|
||||
ADD_EXECUTABLE(pcretest pcretest.c)
|
||||
SET(targets ${targets} pcretest)
|
||||
TARGET_LINK_LIBRARIES(pcretest pcreposix ${PCRETEST_LIBS})
|
||||
|
||||
IF(PCRE_BUILD_PCRECPP)
|
||||
ADD_EXECUTABLE(pcrecpp_unittest pcrecpp_unittest.cc)
|
||||
SET(targets ${targets} pcrecpp_unittest)
|
||||
TARGET_LINK_LIBRARIES(pcrecpp_unittest pcrecpp)
|
||||
IF(MINGW AND NON_STANDARD_LIB_NAMES AND NOT PCRE_STATIC)
|
||||
SET_TARGET_PROPERTIES(pcrecpp
|
||||
PROPERTIES PREFIX ""
|
||||
)
|
||||
ENDIF(MINGW AND NON_STANDARD_LIB_NAMES AND NOT PCRE_STATIC)
|
||||
|
||||
|
||||
ADD_EXECUTABLE(pcre_scanner_unittest pcre_scanner_unittest.cc)
|
||||
SET(targets ${targets} pcre_scanner_unittest)
|
||||
TARGET_LINK_LIBRARIES(pcre_scanner_unittest pcrecpp)
|
||||
|
||||
ADD_EXECUTABLE(pcre_stringpiece_unittest pcre_stringpiece_unittest.cc)
|
||||
SET(targets ${targets} pcre_stringpiece_unittest)
|
||||
TARGET_LINK_LIBRARIES(pcre_stringpiece_unittest pcrecpp)
|
||||
ENDIF(PCRE_BUILD_PCRECPP)
|
||||
|
||||
GET_TARGET_PROPERTY(PCREGREP_EXE pcregrep DEBUG_LOCATION)
|
||||
GET_TARGET_PROPERTY(PCRETEST_EXE pcretest DEBUG_LOCATION)
|
||||
|
||||
# Write out a CTest configuration file that sets some needed environment
|
||||
# variables for the test scripts.
|
||||
#
|
||||
FILE(WRITE ${PROJECT_BINARY_DIR}/CTestCustom.ctest
|
||||
"# This is a generated file.
|
||||
SET(ENV{srcdir} ${PROJECT_SOURCE_DIR})
|
||||
SET(ENV{pcregrep} ${PCREGREP_EXE})
|
||||
SET(ENV{pcretest} ${PCRETEST_EXE})
|
||||
")
|
||||
|
||||
IF(UNIX)
|
||||
ADD_TEST(pcre_test ${PROJECT_SOURCE_DIR}/RunTest)
|
||||
ADD_TEST(pcre_grep_test ${PROJECT_SOURCE_DIR}/RunGrepTest)
|
||||
ENDIF(UNIX)
|
||||
IF(WIN32)
|
||||
ADD_TEST(pcre_test cmd /C ${PROJECT_SOURCE_DIR}/RunTest.bat)
|
||||
ENDIF(WIN32)
|
||||
|
||||
GET_TARGET_PROPERTY(PCRECPP_UNITTEST_EXE
|
||||
pcrecpp_unittest
|
||||
DEBUG_LOCATION)
|
||||
|
||||
GET_TARGET_PROPERTY(PCRE_SCANNER_UNITTEST_EXE
|
||||
pcre_scanner_unittest
|
||||
DEBUG_LOCATION)
|
||||
|
||||
GET_TARGET_PROPERTY(PCRE_STRINGPIECE_UNITTEST_EXE
|
||||
pcre_stringpiece_unittest
|
||||
DEBUG_LOCATION)
|
||||
|
||||
ADD_TEST(pcrecpp_test ${PCRECPP_UNITTEST_EXE})
|
||||
ADD_TEST(pcre_scanner_test ${PCRE_SCANNER_UNITTEST_EXE})
|
||||
ADD_TEST(pcre_stringpiece_test ${PCRE_STRINGPIECE_UNITTEST_EXE})
|
||||
ENDIF(PCRE_BUILD_TESTS)
|
||||
|
||||
# Installation
|
||||
SET(CMAKE_INSTALL_ALWAYS 1)
|
||||
|
||||
INSTALL(TARGETS ${targets}
|
||||
RUNTIME DESTINATION bin
|
||||
LIBRARY DESTINATION lib
|
||||
ARCHIVE DESTINATION lib)
|
||||
|
||||
INSTALL(FILES ${PCRE_HEADERS} ${PCREPOSIX_HEADERS} DESTINATION include)
|
||||
|
||||
FILE(GLOB html ${PROJECT_SOURCE_DIR}/doc/html/*.html)
|
||||
FILE(GLOB man1 ${PROJECT_SOURCE_DIR}/doc/*.1)
|
||||
FILE(GLOB man3 ${PROJECT_SOURCE_DIR}/doc/*.3)
|
||||
|
||||
IF(PCRE_BUILD_PCRECPP)
  INSTALL(FILES ${PCRECPP_HEADERS} DESTINATION include)
ELSE(PCRE_BUILD_PCRECPP)
  # Remove pcrecpp.3: rebuild the man3 list from every entry except that one.
  # The new list must accumulate onto itself (not onto the unfiltered man3),
  # otherwise pcrecpp.3 stays in the result.
  SET(man3_new)
  FOREACH(man ${man3})
    GET_FILENAME_COMPONENT(man_tmp ${man} NAME)
    IF(NOT man_tmp STREQUAL "pcrecpp.3")
      SET(man3_new ${man3_new} ${man})
    ENDIF(NOT man_tmp STREQUAL "pcrecpp.3")
  ENDFOREACH(man ${man3})
  SET(man3 ${man3_new})
ENDIF(PCRE_BUILD_PCRECPP)
|
||||
|
||||
|
||||
INSTALL(FILES ${man1} DESTINATION man/man1)
|
||||
INSTALL(FILES ${man3} DESTINATION man/man3)
|
||||
INSTALL(FILES ${html} DESTINATION share/doc/pcre/html)
|
||||
|
||||
# help, only for nice output
|
||||
IF(BUILD_SHARED_LIBS)
|
||||
SET(BUILD_STATIC_LIBS OFF)
|
||||
ELSE(BUILD_SHARED_LIBS)
|
||||
SET(BUILD_STATIC_LIBS ON)
|
||||
ENDIF(BUILD_SHARED_LIBS)
|
||||
|
||||
# Print a summary of the chosen configuration (can be turned off with
# PCRE_SHOW_REPORT=OFF).
IF(PCRE_SHOW_REPORT)
  STRING(TOUPPER "${CMAKE_BUILD_TYPE}" buildtype)
  # Separator between the general flags and the build-type flags; only set
  # when there are general flags to separate from.
  IF (CMAKE_C_FLAGS)
    SET(cfsp " ")
  ENDIF(CMAKE_C_FLAGS)
  IF (CMAKE_CXX_FLAGS)
    SET(cxxfsp " ")
  ENDIF(CMAKE_CXX_FLAGS)
  # UTF-8 support is compiled in when either UTF-8 or Unicode property
  # support is requested, so report the effective setting rather than
  # repeating the Unicode properties option.
  IF(PCRE_SUPPORT_UTF8 OR PCRE_SUPPORT_UNICODE_PROPERTIES)
    SET(pcre_report_utf8 ON)
  ELSE(PCRE_SUPPORT_UTF8 OR PCRE_SUPPORT_UNICODE_PROPERTIES)
    SET(pcre_report_utf8 OFF)
  ENDIF(PCRE_SUPPORT_UTF8 OR PCRE_SUPPORT_UNICODE_PROPERTIES)
  MESSAGE(STATUS "")
  MESSAGE(STATUS "")
  MESSAGE(STATUS "PCRE configuration summary:")
  MESSAGE(STATUS "")
  MESSAGE(STATUS " Install prefix .................. : ${CMAKE_INSTALL_PREFIX}")
  MESSAGE(STATUS " C compiler ...................... : ${CMAKE_C_COMPILER}")
  MESSAGE(STATUS " C++ compiler .................... : ${CMAKE_CXX_COMPILER}")
  MESSAGE(STATUS " C compiler flags ................ : ${CMAKE_C_FLAGS}${cfsp}${CMAKE_C_FLAGS_${buildtype}}")
  MESSAGE(STATUS " C++ compiler flags .............. : ${CMAKE_CXX_FLAGS}${cxxfsp}${CMAKE_CXX_FLAGS_${buildtype}}")
  MESSAGE(STATUS "")
  MESSAGE(STATUS " Build C++ library ............... : ${PCRE_BUILD_PCRECPP}")
  MESSAGE(STATUS " Enable UTF-8 support ............ : ${pcre_report_utf8}")
  MESSAGE(STATUS " Unicode properties .............. : ${PCRE_SUPPORT_UNICODE_PROPERTIES}")
  MESSAGE(STATUS " Newline char/sequence ........... : ${PCRE_NEWLINE}")
  MESSAGE(STATUS " \\R matches only ANYCRLF ......... : ${PCRE_SUPPORT_BSR_ANYCRLF}")
  MESSAGE(STATUS " EBCDIC coding ................... : ${PCRE_EBCDIC}")
  MESSAGE(STATUS " Rebuild char tables ............. : ${PCRE_REBUILD_CHARTABLES}")
  MESSAGE(STATUS " No stack recursion .............. : ${PCRE_NO_RECURSE}")
  MESSAGE(STATUS " POSIX mem threshold ............. : ${PCRE_POSIX_MALLOC_THRESHOLD}")
  MESSAGE(STATUS " Internal link size .............. : ${PCRE_LINK_SIZE}")
  MESSAGE(STATUS " Match limit ..................... : ${PCRE_MATCH_LIMIT}")
  MESSAGE(STATUS " Match limit recursion ........... : ${PCRE_MATCH_LIMIT_RECURSION}")
  MESSAGE(STATUS " Build shared libs ............... : ${BUILD_SHARED_LIBS}")
  MESSAGE(STATUS " Build static libs ............... : ${BUILD_STATIC_LIBS}")
  MESSAGE(STATUS " Build pcregrep .................. : ${PCRE_BUILD_PCREGREP}")
  MESSAGE(STATUS " Build tests (implies pcretest) .. : ${PCRE_BUILD_TESTS}")
  IF(ZLIB_FOUND)
    MESSAGE(STATUS " Link pcregrep with libz ......... : ${PCRE_SUPPORT_LIBZ}")
  ELSE(ZLIB_FOUND)
    MESSAGE(STATUS " Link pcregrep with libz ......... : None" )
  ENDIF(ZLIB_FOUND)
  IF(BZIP2_FOUND)
    MESSAGE(STATUS " Link pcregrep with libbz2 ....... : ${PCRE_SUPPORT_LIBBZ2}")
  ELSE(BZIP2_FOUND)
    MESSAGE(STATUS " Link pcregrep with libbz2 ....... : None" )
  ENDIF(BZIP2_FOUND)
  IF(NOT PCRE_SUPPORT_LIBREADLINE)
    MESSAGE(STATUS " Link pcretest with libreadline .. : None" )
  ELSE(NOT PCRE_SUPPORT_LIBREADLINE)
    MESSAGE(STATUS " Link pcretest with libreadline .. : ${PCRE_SUPPORT_LIBREADLINE}")
  ENDIF(NOT PCRE_SUPPORT_LIBREADLINE)
  IF(MINGW AND NOT PCRE_STATIC)
    MESSAGE(STATUS " Non-standard dll names (prefix) . : ${NON_STANDARD_LIB_PREFIX}")
    MESSAGE(STATUS " Non-standard dll names (suffix) . : ${NON_STANDARD_LIB_SUFFIX}")
  ENDIF(MINGW AND NOT PCRE_STATIC)
  MESSAGE(STATUS "")
ENDIF(PCRE_SHOW_REPORT)
|
||||
|
||||
# end CMakeLists.txt
|
||||
|
||||
+1
-64
@@ -1,68 +1,5 @@
|
||||
PCRE LICENCE
|
||||
------------
|
||||
|
||||
PCRE is a library of functions to support regular expressions whose syntax
|
||||
and semantics are as close as possible to those of the Perl 5 language.
|
||||
|
||||
Release 6 of PCRE is distributed under the terms of the "BSD" licence, as
|
||||
specified below. The documentation for PCRE, supplied in the "doc"
|
||||
directory, is distributed under the same terms as the software itself.
|
||||
|
||||
The basic library functions are written in C and are freestanding. Also
|
||||
included in the distribution is a set of C++ wrapper functions.
|
||||
|
||||
|
||||
THE BASIC LIBRARY FUNCTIONS
|
||||
---------------------------
|
||||
|
||||
Written by: Philip Hazel
|
||||
Email local part: ph10
|
||||
Email domain: cam.ac.uk
|
||||
|
||||
University of Cambridge Computing Service,
|
||||
Cambridge, England. Phone: +44 1223 334714.
|
||||
|
||||
Copyright (c) 1997-2006 University of Cambridge
|
||||
All rights reserved.
|
||||
|
||||
|
||||
THE C++ WRAPPER FUNCTIONS
|
||||
-------------------------
|
||||
|
||||
Contributed by: Google Inc.
|
||||
|
||||
Copyright (c) 2006, Google Inc.
|
||||
All rights reserved.
|
||||
|
||||
|
||||
THE "BSD" LICENCE
|
||||
-----------------
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright notice,
|
||||
this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of the University of Cambridge nor the name of Google
|
||||
Inc. nor the names of their contributors may be used to endorse or
|
||||
promote products derived from this software without specific prior
|
||||
written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGE.
|
||||
Please see the file LICENCE in the PCRE distribution for licensing details.
|
||||
|
||||
End
|
||||
|
||||
+1229
File diff suppressed because it is too large
Load Diff
Executable
+113
@@ -0,0 +1,113 @@
|
||||
#! /usr/bin/perl -w
|
||||
|
||||
# Script to take the output of nroff -man and remove all the backspacing and
|
||||
# the page footers and the screen commands etc so that it is more usefully
|
||||
# readable online. In fact, in the latest nroff, intermediate footers don't
|
||||
# seem to be generated any more.
|
||||
|
||||
# State carried from one input line to the next.

my $pending_blanks = 0;    # blank lines read but not yet written out
my $after_cut      = 0;    # true directly after a footer chunk was removed
my $want_header    = 1;    # cleared once the first page header is printed
my $remembered;            # line kept for the indentation comparison below
my @ahead;                 # three-line lookahead buffer

# This is a filter: it reads STDIN and writes STDOUT.

while (defined(my $line = <STDIN>)) {

    # Strip "ESC [ number m" screen controls, then "char, backspace" pairs.
    $line =~ s/\x1b\[\d+m//g;
    $line =~ s/.\x8//g;

    # Page header lines. Only the first one seen is printed; for every header
    # line (printed or not) the line that follows it is read and discarded.
    if ($line =~ /^PCRE(\w*)\(([13])\)\s+PCRE\1\(\2\)$/) {
        if ($want_header) {
            print $line;
            $remembered  = $line;
            $want_header = 0;
            $after_cut   = 0;
        }
        my $discarded = <STDIN>;    # swallow the line after the header
        next;
    }

    # Empty lines are only counted here; they are written out later, once
    # it is known what follows them.
    if ($line =~ /^\s*$/) {
        $pending_blanks++;
        $after_cut = 0;
        next;
    }

    # From here on $line is non-blank. If a chunk was cut immediately before
    # it and no blank is pending, restore one blank line when the leading
    # whitespace differs from that of the remembered line.
    if ($after_cut && $pending_blanks < 1 && defined $remembered) {
        my ($old_indent) = $remembered =~ /^(\s*)/;
        my ($new_indent) = $line =~ /^(\s*)/;
        $pending_blanks++ if $old_indent ne $new_indent;
    }

    # Fewer than three blanks pending: write the blanks and the line and
    # remember it. A line starting in column one (other than the
    # "Last updated" and "Copyright" lines) gets exactly two blank lines in
    # front of it, unless nothing has been remembered yet.
    if ($pending_blanks < 3) {
        if ($line =~ /^\S/ &&
            $line !~ /^Last updated/ &&
            $line !~ /^Copyright/ &&
            defined $remembered) {
            $pending_blanks = 2;
        }
        print "\n" x $pending_blanks;
        print $line;
        $remembered     = $line;
        $after_cut      = 0;
        $pending_blanks = 0;
        next;
    }

    # Three or more blanks precede this line. Read three more lines (a
    # missing line at end of input counts as empty) and clean them the same
    # way as the main input.
    for my $slot (0 .. 2) {
        my $peek = <STDIN>;
        $peek = "" if !defined $peek;
        $peek =~ s/\x1b\[\d+m//g;
        $peek =~ s/.\x8//g;
        $ahead[$slot] = $peek;
    }

    my $all_blank = !grep { $_ !~ /^\s*$/ } @ahead;

    if ($all_blank) {
        # <3 blanks><non-blank><3 blanks>: drop the current line, the three
        # lookahead lines and three of the pending blanks, and note the cut.
        $pending_blanks -= 3;
        $after_cut = 1;
    }
    else {
        # Not a footer: write the pending blanks, the current line and the
        # three lookahead lines. The lookahead lines get one more
        # "char, backspace" pass before printing; the line remembered
        # afterwards is the current line, not a lookahead line.
        print "\n" x $pending_blanks;
        print $line;
        for my $slot (0 .. 2) {
            $ahead[$slot] =~ s/.\x8//g;
            print $ahead[$slot];
        }
        $remembered     = $line;
        $after_cut      = 0;
        $pending_blanks = 0;
    }
}
|
||||
|
||||
# End
|
||||
Executable
+35
@@ -0,0 +1,35 @@
|
||||
#!/usr/bin/perl
|
||||
|
||||
# This is a script for removing trailing whitespace from lines in files that
|
||||
# are listed on the command line.
|
||||
|
||||
# This subroutine does the work for one file.
|
||||
|
||||
# Remove trailing whitespace from every newline-terminated line of one file.
# The file is rewritten in place, and only when at least one line changed.
# A final line that has no terminating newline is left untouched.
#
# Argument: $_[0] - path of the file to process.
# Returns:  nothing.
# Dies:     if the file cannot be opened for input, cannot be opened for
#           output, or the output handle cannot be closed cleanly.
sub detrail {
    my ($file) = @_;
    my $changed = 0;

    # Three-argument open: the name is used literally, so leading or trailing
    # whitespace and characters such as '>' or '|' in it are not interpreted.
    open(my $in, '<', $file) || die "Can't open $file for input: $!";
    my @lines = <$in>;
    close($in);

    foreach my $line (@lines) {
        $changed = 1 if $line =~ s/\s+\n$/\n/;
    }

    if ($changed) {
        open(my $out, '>', $file) || die "Can't open $file for output: $!";
        print {$out} @lines;
        # Buffered write errors only surface when the handle is closed.
        close($out) || die "Can't close $file after writing: $!";
    }

    return;
}
|
||||
|
||||
# This is the main program
|
||||
|
||||
# Print list elements with nothing inserted between them.
$, = "";

# Process each file named on the command line, in the order given.
foreach my $path (@ARGV) {
    detrail($path);
}
|
||||
|
||||
# End
|
||||
@@ -0,0 +1,418 @@
|
||||
Technical Notes about PCRE
|
||||
--------------------------
|
||||
|
||||
These are very rough technical notes that record potentially useful information
|
||||
about PCRE internals.
|
||||
|
||||
Historical note 1
|
||||
-----------------
|
||||
|
||||
Many years ago I implemented some regular expression functions to an algorithm
|
||||
suggested by Martin Richards. These were not Unix-like in form, and were quite
|
||||
restricted in what they could do by comparison with Perl. The interesting part
|
||||
about the algorithm was that the amount of space required to hold the compiled
|
||||
form of an expression was known in advance. The code to apply an expression did
|
||||
not operate by backtracking, as the original Henry Spencer code and current
|
||||
Perl code does, but instead checked all possibilities simultaneously by keeping
|
||||
a list of current states and checking all of them as it advanced through the
|
||||
subject string. In the terminology of Jeffrey Friedl's book, it was a "DFA
|
||||
algorithm", though it was not a traditional Finite State Machine (FSM). When
|
||||
the pattern was all used up, all remaining states were possible matches, and
|
||||
the one matching the longest subset of the subject string was chosen. This did
|
||||
not necessarily maximize the individual wild portions of the pattern, as is
|
||||
expected in Unix and Perl-style regular expressions.
|
||||
|
||||
Historical note 2
|
||||
-----------------
|
||||
|
||||
By contrast, the code originally written by Henry Spencer (which was
|
||||
subsequently heavily modified for Perl) compiles the expression twice: once in
|
||||
a dummy mode in order to find out how much store will be needed, and then for
|
||||
real. (The Perl version probably doesn't do this any more; I'm talking about
|
||||
the original library.) The execution function operates by backtracking and
|
||||
maximizing (or, optionally, minimizing in Perl) the amount of the subject that
|
||||
matches individual wild portions of the pattern. This is an "NFA algorithm" in
|
||||
Friedl's terminology.
|
||||
|
||||
OK, here's the real stuff
|
||||
-------------------------
|
||||
|
||||
For the set of functions that form the "basic" PCRE library (which are
|
||||
unrelated to those mentioned above), I tried at first to invent an algorithm
|
||||
that used an amount of store bounded by a multiple of the number of characters
|
||||
in the pattern, to save on compiling time. However, because of the greater
|
||||
complexity in Perl regular expressions, I couldn't do this. In any case, a
|
||||
first pass through the pattern is helpful for other reasons.
|
||||
|
||||
Computing the memory requirement: how it was
|
||||
--------------------------------------------
|
||||
|
||||
Up to and including release 6.7, PCRE worked by running a very degenerate first
|
||||
pass to calculate a maximum store size, and then a second pass to do the real
|
||||
compile - which might use a bit less than the predicted amount of memory. The
|
||||
idea was that this would turn out faster than the Henry Spencer code because
|
||||
the first pass is degenerate and the second pass can just store stuff straight
|
||||
into the vector, which it knows is big enough.
|
||||
|
||||
Computing the memory requirement: how it is
|
||||
-------------------------------------------
|
||||
|
||||
By the time I was working on a potential 6.8 release, the degenerate first pass
|
||||
had become very complicated and hard to maintain. Indeed one of the early
|
||||
things I did for 6.8 was to fix Yet Another Bug in the memory computation. Then
|
||||
I had a flash of inspiration as to how I could run the real compile function in
|
||||
a "fake" mode that enables it to compute how much memory it would need, while
|
||||
actually only ever using a few hundred bytes of working memory, and without too
|
||||
many tests of the mode that might slow it down. So I re-factored the compiling
|
||||
functions to work this way. This got rid of about 600 lines of source. It
|
||||
should make future maintenance and development easier. As this was such a major
|
||||
change, I never released 6.8, instead upping the number to 7.0 (other quite
|
||||
major changes are also present in the 7.0 release).
|
||||
|
||||
A side effect of this work is that the previous limit of 200 on the nesting
|
||||
depth of parentheses was removed. However, there is a downside: pcre_compile()
|
||||
runs more slowly than before (30% or more, depending on the pattern) because it
|
||||
is doing a full analysis of the pattern. My hope is that this is not a big
|
||||
issue.
|
||||
|
||||
Traditional matching function
|
||||
-----------------------------
|
||||
|
||||
The "traditional", and original, matching function is called pcre_exec(), and
|
||||
it implements an NFA algorithm, similar to the original Henry Spencer algorithm
|
||||
and the way that Perl works. Not surprising, since it is intended to be as
|
||||
compatible with Perl as possible. This is the function most users of PCRE will
|
||||
use most of the time.
|
||||
|
||||
Supplementary matching function
|
||||
-------------------------------
|
||||
|
||||
From PCRE 6.0, there is also a supplementary matching function called
|
||||
pcre_dfa_exec(). This implements a DFA matching algorithm that searches
|
||||
simultaneously for all possible matches that start at one point in the subject
|
||||
string. (Going back to my roots: see Historical Note 1 above.) This function
|
||||
interprets the same compiled pattern data as pcre_exec(); however, not all the
|
||||
facilities are available, and those that are do not always work in quite the
|
||||
same way. See the user documentation for details.
|
||||
|
||||
The algorithm that is used for pcre_dfa_exec() is not a traditional FSM,
|
||||
because it may have a number of states active at one time. More work would be
|
||||
needed at compile time to produce a traditional FSM where only one state is
|
||||
ever active at once. I believe some other regex matchers work this way.
|
||||
|
||||
|
||||
Format of compiled patterns
|
||||
---------------------------
|
||||
|
||||
The compiled form of a pattern is a vector of bytes, containing items of
|
||||
variable length. The first byte in an item is an opcode, and the length of the
|
||||
item is either implicit in the opcode or contained in the data bytes that
|
||||
follow it.
|
||||
|
||||
In many cases below LINK_SIZE data values are specified for offsets within the
|
||||
compiled pattern. The default value for LINK_SIZE is 2, but PCRE can be
|
||||
compiled to use 3-byte or 4-byte values for these offsets (impairing the
|
||||
performance). This is necessary only when patterns whose compiled length is
|
||||
greater than 64K are going to be processed. In this description, we assume the
|
||||
"normal" compilation options. Data values that are counts (e.g. for
|
||||
quantifiers) are always just two bytes long.
|
||||
|
||||
A list of the opcodes follows:
|
||||
|
||||
Opcodes with no following data
|
||||
------------------------------
|
||||
|
||||
These items are all just one byte long
|
||||
|
||||
OP_END end of pattern
|
||||
OP_ANY match any one character other than newline
|
||||
OP_ALLANY match any one character, including newline
|
||||
OP_ANYBYTE match any single byte, even in UTF-8 mode
|
||||
OP_SOD match start of data: \A
|
||||
OP_SOM start of match (subject + offset): \G
|
||||
OP_SET_SOM set start of match (\K)
|
||||
OP_CIRC ^ (start of data, or after \n in multiline)
|
||||
OP_NOT_WORD_BOUNDARY \B
|
||||
OP_WORD_BOUNDARY \b
|
||||
OP_NOT_DIGIT \D
|
||||
OP_DIGIT \d
|
||||
OP_NOT_HSPACE \H
|
||||
OP_HSPACE \h
|
||||
OP_NOT_WHITESPACE \S
|
||||
OP_WHITESPACE \s
|
||||
OP_NOT_VSPACE \V
|
||||
OP_VSPACE \v
|
||||
OP_NOT_WORDCHAR \W
|
||||
OP_WORDCHAR \w
|
||||
OP_EODN match end of data or \n at end: \Z
|
||||
OP_EOD match end of data: \z
|
||||
OP_DOLL $ (end of data, or before \n in multiline)
|
||||
OP_EXTUNI match an extended Unicode character
|
||||
OP_ANYNL match any Unicode newline sequence
|
||||
|
||||
OP_ACCEPT )
|
||||
OP_COMMIT )
|
||||
OP_FAIL ) These are Perl 5.10's "backtracking
|
||||
OP_PRUNE ) control verbs".
|
||||
OP_SKIP )
|
||||
OP_THEN )
|
||||
|
||||
|
||||
Repeating single characters
|
||||
---------------------------
|
||||
|
||||
The common repeats (*, +, ?) when applied to a single character use the
|
||||
following opcodes:
|
||||
|
||||
OP_STAR
|
||||
OP_MINSTAR
|
||||
OP_POSSTAR
|
||||
OP_PLUS
|
||||
OP_MINPLUS
|
||||
OP_POSPLUS
|
||||
OP_QUERY
|
||||
OP_MINQUERY
|
||||
OP_POSQUERY
|
||||
|
||||
In ASCII mode, these are two-byte items; in UTF-8 mode, the length is variable.
|
||||
Those with "MIN" in their name are the minimizing versions. Those with "POS" in
|
||||
their names are possessive versions. Each is followed by the character that is
|
||||
to be repeated. Other repeats make use of
|
||||
|
||||
OP_UPTO
|
||||
OP_MINUPTO
|
||||
OP_POSUPTO
|
||||
OP_EXACT
|
||||
|
||||
which are followed by a two-byte count (most significant first) and the
|
||||
repeated character. OP_UPTO matches from 0 to the given number. A repeat with a
|
||||
non-zero minimum and a fixed maximum is coded as an OP_EXACT followed by an
|
||||
OP_UPTO (or OP_MINUPTO or OP_POSUPTO).
|
||||
|
||||
|
||||
Repeating character types
|
||||
-------------------------
|
||||
|
||||
Repeats of things like \d are done exactly as for single characters, except
|
||||
that instead of a character, the opcode for the type is stored in the data
|
||||
byte. The opcodes are:
|
||||
|
||||
OP_TYPESTAR
|
||||
OP_TYPEMINSTAR
|
||||
OP_TYPEPOSSTAR
|
||||
OP_TYPEPLUS
|
||||
OP_TYPEMINPLUS
|
||||
OP_TYPEPOSPLUS
|
||||
OP_TYPEQUERY
|
||||
OP_TYPEMINQUERY
|
||||
OP_TYPEPOSQUERY
|
||||
OP_TYPEUPTO
|
||||
OP_TYPEMINUPTO
|
||||
OP_TYPEPOSUPTO
|
||||
OP_TYPEEXACT
|
||||
|
||||
|
||||
Match by Unicode property
|
||||
-------------------------
|
||||
|
||||
OP_PROP and OP_NOTPROP are used for positive and negative matches of a
|
||||
character by testing its Unicode property (the \p and \P escape sequences).
|
||||
Each is followed by two bytes that encode the desired property as a type and a
|
||||
value.
|
||||
|
||||
Repeats of these items use the OP_TYPESTAR etc. set of opcodes, followed by
|
||||
three bytes: OP_PROP or OP_NOTPROP and then the desired property type and
|
||||
value.
|
||||
|
||||
|
||||
Matching literal characters
|
||||
---------------------------
|
||||
|
||||
The OP_CHAR opcode is followed by a single character that is to be matched
|
||||
casefully. For caseless matching, OP_CHARNC is used. In UTF-8 mode, the
|
||||
character may be more than one byte long. (Earlier versions of PCRE used
|
||||
multi-character strings, but this was changed to allow some new features to be
|
||||
added.)
|
||||
|
||||
|
||||
Character classes
|
||||
-----------------
|
||||
|
||||
If there is only one character, OP_CHAR or OP_CHARNC is used for a positive
|
||||
class, and OP_NOT for a negative one (that is, for something like [^a]).
|
||||
However, in UTF-8 mode, the use of OP_NOT applies only to characters with
|
||||
values < 128, because OP_NOT is confined to single bytes.
|
||||
|
||||
Another set of repeating opcodes (OP_NOTSTAR etc.) are used for a repeated,
|
||||
negated, single-character class. The normal ones (OP_STAR etc.) are used for a
|
||||
repeated positive single-character class.
|
||||
|
||||
When there's more than one character in a class and all the characters are less
|
||||
than 256, OP_CLASS is used for a positive class, and OP_NCLASS for a negative
|
||||
one. In either case, the opcode is followed by a 32-byte bit map containing a 1
|
||||
bit for every character that is acceptable. The bits are counted from the least
|
||||
significant end of each byte.
|
||||
|
||||
The reason for having both OP_CLASS and OP_NCLASS is so that, in UTF-8 mode,
|
||||
subject characters with values greater than 255 can be handled correctly. For
|
||||
OP_CLASS they don't match, whereas for OP_NCLASS they do.
|
||||
|
||||
For classes containing characters with values > 255, OP_XCLASS is used. It
|
||||
optionally uses a bit map (if any characters lie within it), followed by a list
|
||||
of pairs and single characters. There is a flag character that indicates
|
||||
whether it's a positive or a negative class.
|
||||
|
||||
|
||||
Back references
|
||||
---------------
|
||||
|
||||
OP_REF is followed by two bytes containing the reference number.
|
||||
|
||||
|
||||
Repeating character classes and back references
|
||||
-----------------------------------------------
|
||||
|
||||
Single-character classes are handled specially (see above). This section
|
||||
applies to OP_CLASS and OP_REF. In both cases, the repeat information follows
|
||||
the base item. The matching code looks at the following opcode to see if it is
|
||||
one of
|
||||
|
||||
OP_CRSTAR
|
||||
OP_CRMINSTAR
|
||||
OP_CRPLUS
|
||||
OP_CRMINPLUS
|
||||
OP_CRQUERY
|
||||
OP_CRMINQUERY
|
||||
OP_CRRANGE
|
||||
OP_CRMINRANGE
|
||||
|
||||
All but the last two are just single-byte items. The others are followed by
|
||||
four bytes of data, comprising the minimum and maximum repeat counts. There are
|
||||
no special possessive opcodes for these repeats; a possessive repeat is
|
||||
compiled into an atomic group.
|
||||
|
||||
|
||||
Brackets and alternation
|
||||
------------------------
|
||||
|
||||
A pair of non-capturing (round) brackets is wrapped round each expression at
|
||||
compile time, so alternation always happens in the context of brackets.
|
||||
|
||||
[Note for North Americans: "bracket" to some English speakers, including
|
||||
myself, can be round, square, curly, or pointy. Hence this usage.]
|
||||
|
||||
Non-capturing brackets use the opcode OP_BRA. Originally PCRE was limited to 99
|
||||
capturing brackets and it used a different opcode for each one. From release
|
||||
3.5, the limit was removed by putting the bracket number into the data for
|
||||
higher-numbered brackets. From release 7.0 all capturing brackets are handled
|
||||
this way, using the single opcode OP_CBRA.
|
||||
|
||||
A bracket opcode is followed by LINK_SIZE bytes which give the offset to the
|
||||
next alternative OP_ALT or, if there aren't any branches, to the matching
|
||||
OP_KET opcode. Each OP_ALT is followed by LINK_SIZE bytes giving the offset to
|
||||
the next one, or to the OP_KET opcode. For capturing brackets, the bracket
|
||||
number immediately follows the offset, always as a 2-byte item.
|
||||
|
||||
OP_KET is used for subpatterns that do not repeat indefinitely, while
|
||||
OP_KETRMIN and OP_KETRMAX are used for indefinite repetitions, minimally or
|
||||
maximally respectively. All three are followed by LINK_SIZE bytes giving (as a
|
||||
positive number) the offset back to the matching bracket opcode.
|
||||
|
||||
If a subpattern is quantified such that it is permitted to match zero times, it
|
||||
is preceded by one of OP_BRAZERO, OP_BRAMINZERO, or OP_SKIPZERO. These are
|
||||
single-byte opcodes that tell the matcher that skipping the following
|
||||
subpattern entirely is a valid branch. In the case of the first two, not
|
||||
skipping the pattern is also valid (greedy and non-greedy). The third is used
|
||||
when a pattern has the quantifier {0,0}. It cannot be entirely discarded,
|
||||
because it may be called as a subroutine from elsewhere in the regex.
|
||||
|
||||
A subpattern with an indefinite maximum repetition is replicated in the
|
||||
compiled data its minimum number of times (or once with OP_BRAZERO if the
|
||||
minimum is zero), with the final copy terminating with OP_KETRMIN or OP_KETRMAX
|
||||
as appropriate.
|
||||
|
||||
A subpattern with a bounded maximum repetition is replicated in a nested
|
||||
fashion up to the maximum number of times, with OP_BRAZERO or OP_BRAMINZERO
|
||||
before each replication after the minimum, so that, for example, (abc){2,5} is
|
||||
compiled as (abc)(abc)((abc)((abc)(abc)?)?)?, except that each bracketed group
|
||||
has the same number.
|
||||
|
||||
When a repeated subpattern has an unbounded upper limit, it is checked to see
|
||||
whether it could match an empty string. If this is the case, the opcode in the
|
||||
final replication is changed to OP_SBRA or OP_SCBRA. This tells the matcher
|
||||
that it needs to check for matching an empty string when it hits OP_KETRMIN or
|
||||
OP_KETRMAX, and if so, to break the loop.
|
||||
|
||||
|
||||
Assertions
|
||||
----------
|
||||
|
||||
Forward assertions are just like other subpatterns, but starting with one of
|
||||
the opcodes OP_ASSERT or OP_ASSERT_NOT. Backward assertions use the opcodes
|
||||
OP_ASSERTBACK and OP_ASSERTBACK_NOT, and the first opcode inside the assertion
|
||||
is OP_REVERSE, followed by a two byte count of the number of characters to move
|
||||
back the pointer in the subject string. When operating in UTF-8 mode, the count
|
||||
is a character count rather than a byte count. A separate count is present in
|
||||
each alternative of a lookbehind assertion, allowing them to have different
|
||||
fixed lengths.
|
||||
|
||||
|
||||
Once-only (atomic) subpatterns
|
||||
------------------------------
|
||||
|
||||
These are also just like other subpatterns, but they start with the opcode
|
||||
OP_ONCE. The check for matching an empty string in an unbounded repeat is
|
||||
handled entirely at runtime, so there is just this one opcode.
|
||||
|
||||
|
||||
Conditional subpatterns
|
||||
-----------------------
|
||||
|
||||
These are like other subpatterns, but they start with the opcode OP_COND, or
|
||||
OP_SCOND for one that might match an empty string in an unbounded repeat. If
|
||||
the condition is a back reference, this is stored at the start of the
|
||||
subpattern using the opcode OP_CREF followed by two bytes containing the
|
||||
reference number. If the condition is "in recursion" (coded as "(?(R)"), or "in
|
||||
recursion of group x" (coded as "(?(Rx)"), the group number is stored at the
|
||||
start of the subpattern using the opcode OP_RREF, and a value of zero for "the
|
||||
whole pattern". For a DEFINE condition, just the single byte OP_DEF is used (it
|
||||
has no associated data). Otherwise, a conditional subpattern always starts with
|
||||
one of the assertions.
|
||||
|
||||
|
||||
Recursion
|
||||
---------
|
||||
|
||||
Recursion either matches the current regex, or some subexpression. The opcode
|
||||
OP_RECURSE is followed by a value which is the offset to the starting bracket
|
||||
from the start of the whole pattern. From release 6.5, OP_RECURSE is
|
||||
automatically wrapped inside OP_ONCE brackets (because otherwise some patterns
|
||||
broke it). OP_RECURSE is also used for "subroutine" calls, even though they
|
||||
are not strictly a recursion.
|
||||
|
||||
|
||||
Callout
|
||||
-------
|
||||
|
||||
OP_CALLOUT is followed by one byte of data that holds a callout number in the
|
||||
range 0 to 254 for manual callouts, or 255 for an automatic callout. In both
|
||||
cases there follows a two-byte value giving the offset in the pattern to the
|
||||
start of the following item, and another two-byte item giving the length of the
|
||||
next item.
|
||||
|
||||
|
||||
Changing options
|
||||
----------------
|
||||
|
||||
If any of the /i, /m, or /s options are changed within a pattern, an OP_OPT
|
||||
opcode is compiled, followed by one byte containing the new settings of these
|
||||
flags. If there are several alternatives, there is an occurrence of OP_OPT at
|
||||
the start of all those following the first options change, to set appropriate
|
||||
options for the start of the alternative. Immediately after the end of the
|
||||
group there is another such item to reset the flags to their previous values. A
|
||||
change of flag right at the very start of the pattern can be handled entirely
|
||||
at compile time, and so does not cause anything to be put into the compiled
|
||||
data.
|
||||
|
||||
Philip Hazel
|
||||
April 2008
|
||||
+171
-66
@@ -1,41 +1,54 @@
|
||||
Installation Instructions
|
||||
*************************
|
||||
|
||||
Copyright (C) 1994, 1995, 1996, 1999, 2000, 2001, 2002, 2004, 2005,
|
||||
2006, 2007, 2008 Free Software Foundation, Inc.
|
||||
|
||||
This file is free documentation; the Free Software Foundation gives
|
||||
unlimited permission to copy, distribute and modify it.
|
||||
|
||||
Basic Installation
|
||||
==================
|
||||
|
||||
These are generic installation instructions that apply to systems that
|
||||
can run the `configure' shell script - Unix systems and any that imitate
|
||||
it. They are not specific to PCRE. There are PCRE-specific instructions
|
||||
for non-Unix systems in the file NON-UNIX-USE.
|
||||
Briefly, the shell commands `./configure; make; make install' should
|
||||
configure, build, and install this package. The following
|
||||
more-detailed instructions are generic; see the `README' file for
|
||||
instructions specific to this package.
|
||||
|
||||
The `configure' shell script attempts to guess correct values for
|
||||
various system-dependent variables used during compilation. It uses
|
||||
those values to create a `Makefile' in each directory of the package.
|
||||
It may also create one or more `.h' files containing system-dependent
|
||||
definitions. Finally, it creates a shell script `config.status' that
|
||||
you can run in the future to recreate the current configuration, a file
|
||||
`config.cache' that saves the results of its tests to speed up
|
||||
reconfiguring, and a file `config.log' containing compiler output
|
||||
(useful mainly for debugging `configure').
|
||||
you can run in the future to recreate the current configuration, and a
|
||||
file `config.log' containing compiler output (useful mainly for
|
||||
debugging `configure').
|
||||
|
||||
It can also use an optional file (typically called `config.cache'
|
||||
and enabled with `--cache-file=config.cache' or simply `-C') that saves
|
||||
the results of its tests to speed up reconfiguring. Caching is
|
||||
disabled by default to prevent problems with accidental use of stale
|
||||
cache files.
|
||||
|
||||
If you need to do unusual things to compile the package, please try
|
||||
to figure out how `configure' could check whether to do them, and mail
|
||||
diffs or instructions to the address given in the `README' so they can
|
||||
be considered for the next release. If at some point `config.cache'
|
||||
contains results you don't want to keep, you may remove or edit it.
|
||||
be considered for the next release. If you are using the cache, and at
|
||||
some point `config.cache' contains results you don't want to keep, you
|
||||
may remove or edit it.
|
||||
|
||||
The file `configure.in' is used to create `configure' by a program
|
||||
called `autoconf'. You only need `configure.in' if you want to change
|
||||
it or regenerate `configure' using a newer version of `autoconf'.
|
||||
The file `configure.ac' (or `configure.in') is used to create
|
||||
`configure' by a program called `autoconf'. You need `configure.ac' if
|
||||
you want to change it or regenerate `configure' using a newer version
|
||||
of `autoconf'.
|
||||
|
||||
The simplest way to compile this package is:
|
||||
|
||||
1. `cd' to the directory containing the package's source code and type
|
||||
`./configure' to configure the package for your system. If you're
|
||||
using `csh' on an old version of System V, you might need to type
|
||||
`sh ./configure' instead to prevent `csh' from trying to execute
|
||||
`configure' itself.
|
||||
`./configure' to configure the package for your system.
|
||||
|
||||
Running `configure' takes awhile. While running, it prints some
|
||||
messages telling which features it is checking for.
|
||||
Running `configure' might take a while. While running, it prints
|
||||
some messages telling which features it is checking for.
|
||||
|
||||
2. Type `make' to compile the package.
|
||||
|
||||
@@ -54,52 +67,69 @@ The simplest way to compile this package is:
|
||||
all sorts of other programs in order to regenerate files that came
|
||||
with the distribution.
|
||||
|
||||
6. Often, you can also type `make uninstall' to remove the installed
|
||||
files again.
|
||||
|
||||
Compilers and Options
|
||||
=====================
|
||||
|
||||
Some systems require unusual options for compilation or linking that
|
||||
the `configure' script does not know about. You can give `configure'
|
||||
initial values for variables by setting them in the environment. Using
|
||||
a Bourne-compatible shell, you can do that on the command line like
|
||||
this:
|
||||
CC=c89 CFLAGS=-O2 LIBS=-lposix ./configure
|
||||
the `configure' script does not know about. Run `./configure --help'
|
||||
for details on some of the pertinent environment variables.
|
||||
|
||||
Or on systems that have the `env' program, you can do it like this:
|
||||
env CPPFLAGS=-I/usr/local/include LDFLAGS=-s ./configure
|
||||
You can give `configure' initial values for configuration parameters
|
||||
by setting variables in the command line or in the environment. Here
|
||||
is an example:
|
||||
|
||||
./configure CC=c99 CFLAGS=-g LIBS=-lposix
|
||||
|
||||
*Note Defining Variables::, for more details.
|
||||
|
||||
Compiling For Multiple Architectures
|
||||
====================================
|
||||
|
||||
You can compile the package for more than one kind of computer at the
|
||||
same time, by placing the object files for each architecture in their
|
||||
own directory. To do this, you must use a version of `make' that
|
||||
supports the `VPATH' variable, such as GNU `make'. `cd' to the
|
||||
own directory. To do this, you can use GNU `make'. `cd' to the
|
||||
directory where you want the object files and executables to go and run
|
||||
the `configure' script. `configure' automatically checks for the
|
||||
source code in the directory that `configure' is in and in `..'.
|
||||
|
||||
If you have to use a `make' that does not support the `VPATH'
|
||||
variable, you have to compile the package for one architecture at a time
|
||||
in the source code directory. After you have installed the package for
|
||||
one architecture, use `make distclean' before reconfiguring for another
|
||||
architecture.
|
||||
With a non-GNU `make', it is safer to compile the package for one
|
||||
architecture at a time in the source code directory. After you have
|
||||
installed the package for one architecture, use `make distclean' before
|
||||
reconfiguring for another architecture.
|
||||
|
||||
On MacOS X 10.5 and later systems, you can create libraries and
|
||||
executables that work on multiple system types--known as "fat" or
|
||||
"universal" binaries--by specifying multiple `-arch' options to the
|
||||
compiler but only a single `-arch' option to the preprocessor. Like
|
||||
this:
|
||||
|
||||
./configure CC="gcc -arch i386 -arch x86_64 -arch ppc -arch ppc64" \
|
||||
CXX="g++ -arch i386 -arch x86_64 -arch ppc -arch ppc64" \
|
||||
CPP="gcc -E" CXXCPP="g++ -E"
|
||||
|
||||
This is not guaranteed to produce working output in all cases, you
|
||||
may have to build one architecture at a time and combine the results
|
||||
using the `lipo' tool if you have problems.
|
||||
|
||||
Installation Names
|
||||
==================
|
||||
|
||||
By default, `make install' will install the package's files in
|
||||
`/usr/local/bin', `/usr/local/man', etc. You can specify an
|
||||
installation prefix other than `/usr/local' by giving `configure' the
|
||||
option `--prefix=PATH'.
|
||||
By default, `make install' installs the package's commands under
|
||||
`/usr/local/bin', include files under `/usr/local/include', etc. You
|
||||
can specify an installation prefix other than `/usr/local' by giving
|
||||
`configure' the option `--prefix=PREFIX'.
|
||||
|
||||
You can specify separate installation prefixes for
|
||||
architecture-specific files and architecture-independent files. If you
|
||||
give `configure' the option `--exec-prefix=PATH', the package will use
|
||||
PATH as the prefix for installing programs and libraries.
|
||||
Documentation and other data files will still use the regular prefix.
|
||||
pass the option `--exec-prefix=PREFIX' to `configure', the package uses
|
||||
PREFIX as the prefix for installing programs and libraries.
|
||||
Documentation and other data files still use the regular prefix.
|
||||
|
||||
In addition, if you use an unusual directory layout you can give
|
||||
options like `--bindir=PATH' to specify different values for particular
|
||||
options like `--bindir=DIR' to specify different values for particular
|
||||
kinds of files. Run `configure --help' for a list of the directories
|
||||
you can set and what kinds of files go in them.
|
||||
|
||||
@@ -122,25 +152,57 @@ find the X include and library files automatically, but if it doesn't,
|
||||
you can use the `configure' options `--x-includes=DIR' and
|
||||
`--x-libraries=DIR' to specify their locations.
|
||||
|
||||
Particular systems
|
||||
==================
|
||||
|
||||
On HP-UX, the default C compiler is not ANSI C compatible. If GNU
|
||||
CC is not installed, it is recommended to use the following options in
|
||||
order to use an ANSI C compiler:
|
||||
|
||||
./configure CC="cc -Ae"
|
||||
|
||||
and if that doesn't work, install pre-built binaries of GCC for HP-UX.
|
||||
|
||||
On OSF/1 a.k.a. Tru64, some versions of the default C compiler cannot
|
||||
parse its `<wchar.h>' header file. The option `-nodtk' can be used as
|
||||
a workaround. If GNU CC is not installed, it is therefore recommended
|
||||
to try
|
||||
|
||||
./configure CC="cc"
|
||||
|
||||
and if that doesn't work, try
|
||||
|
||||
./configure CC="cc -nodtk"
|
||||
|
||||
Specifying the System Type
|
||||
==========================
|
||||
|
||||
There may be some features `configure' can not figure out
|
||||
automatically, but needs to determine by the type of host the package
|
||||
will run on. Usually `configure' can figure that out, but if it prints
|
||||
a message saying it can not guess the host type, give it the
|
||||
`--host=TYPE' option. TYPE can either be a short name for the system
|
||||
type, such as `sun4', or a canonical name with three fields:
|
||||
There may be some features `configure' cannot figure out
|
||||
automatically, but needs to determine by the type of machine the package
|
||||
will run on. Usually, assuming the package is built to be run on the
|
||||
_same_ architectures, `configure' can figure that out, but if it prints
|
||||
a message saying it cannot guess the machine type, give it the
|
||||
`--build=TYPE' option. TYPE can either be a short name for the system
|
||||
type, such as `sun4', or a canonical name which has the form:
|
||||
|
||||
CPU-COMPANY-SYSTEM
|
||||
|
||||
See the file `config.sub' for the possible values of each field. If
|
||||
`config.sub' isn't included in this package, then this package doesn't
|
||||
need to know the host type.
|
||||
where SYSTEM can have one of these forms:
|
||||
|
||||
If you are building compiler tools for cross-compiling, you can also
|
||||
use the `--target=TYPE' option to select the type of system they will
|
||||
produce code for and the `--build=TYPE' option to select the type of
|
||||
system on which you are compiling the package.
|
||||
OS KERNEL-OS
|
||||
|
||||
See the file `config.sub' for the possible values of each field. If
|
||||
`config.sub' isn't included in this package, then this package doesn't
|
||||
need to know the machine type.
|
||||
|
||||
If you are _building_ compiler tools for cross-compiling, you should
|
||||
use the option `--target=TYPE' to select the type of system they will
|
||||
produce code for.
|
||||
|
||||
If you want to _use_ a cross compiler, that generates code for a
|
||||
platform different from the build platform, you should specify the
|
||||
"host" platform (i.e., that on which the generated programs will
|
||||
eventually be run) with `--host=TYPE'.
|
||||
|
||||
Sharing Defaults
|
||||
================
|
||||
@@ -153,19 +215,55 @@ default values for variables like `CC', `cache_file', and `prefix'.
|
||||
`CONFIG_SITE' environment variable to the location of the site script.
|
||||
A warning: not all `configure' scripts look for a site script.
|
||||
|
||||
Operation Controls
|
||||
Defining Variables
|
||||
==================
|
||||
|
||||
Variables not defined in a site shell script can be set in the
|
||||
environment passed to `configure'. However, some packages may run
|
||||
configure again during the build, and the customized values of these
|
||||
variables may be lost. In order to avoid this problem, you should set
|
||||
them in the `configure' command line, using `VAR=value'. For example:
|
||||
|
||||
./configure CC=/usr/local2/bin/gcc
|
||||
|
||||
causes the specified `gcc' to be used as the C compiler (unless it is
|
||||
overridden in the site shell script).
|
||||
|
||||
Unfortunately, this technique does not work for `CONFIG_SHELL' due to
|
||||
an Autoconf bug. Until the bug is fixed you can use this workaround:
|
||||
|
||||
CONFIG_SHELL=/bin/bash /bin/bash ./configure CONFIG_SHELL=/bin/bash
|
||||
|
||||
`configure' Invocation
|
||||
======================
|
||||
|
||||
`configure' recognizes the following options to control how it
|
||||
operates.
|
||||
|
||||
`--cache-file=FILE'
|
||||
Use and save the results of the tests in FILE instead of
|
||||
`./config.cache'. Set FILE to `/dev/null' to disable caching, for
|
||||
debugging `configure'.
|
||||
|
||||
`--help'
|
||||
Print a summary of the options to `configure', and exit.
|
||||
`-h'
|
||||
Print a summary of all of the options to `configure', and exit.
|
||||
|
||||
`--help=short'
|
||||
`--help=recursive'
|
||||
Print a summary of the options unique to this package's
|
||||
`configure', and exit. The `short' variant lists options used
|
||||
only in the top level, while the `recursive' variant lists options
|
||||
also present in any nested packages.
|
||||
|
||||
`--version'
|
||||
`-V'
|
||||
Print the version of Autoconf used to generate the `configure'
|
||||
script, and exit.
|
||||
|
||||
`--cache-file=FILE'
|
||||
Enable the cache: use and save the results of the tests in FILE,
|
||||
traditionally `config.cache'. FILE defaults to `/dev/null' to
|
||||
disable caching.
|
||||
|
||||
`--config-cache'
|
||||
`-C'
|
||||
Alias for `--cache-file=config.cache'.
|
||||
|
||||
`--quiet'
|
||||
`--silent'
|
||||
@@ -178,9 +276,16 @@ operates.
|
||||
Look for the package's source code in directory DIR. Usually
|
||||
`configure' can determine that directory automatically.
|
||||
|
||||
`--version'
|
||||
Print the version of Autoconf used to generate the `configure'
|
||||
script, and exit.
|
||||
`--prefix=DIR'
|
||||
Use DIR as the installation prefix. *Note Installation Names::
|
||||
for more details, including other options available for fine-tuning
|
||||
the installation locations.
|
||||
|
||||
`--no-create'
|
||||
`-n'
|
||||
Run the configure checks, but stop before creating any output
|
||||
files.
|
||||
|
||||
`configure' also accepts some other, not widely useful, options. Run
|
||||
`configure --help' for more details.
|
||||
|
||||
`configure' also accepts some other, not widely useful, options.
|
||||
|
||||
+4
-4
@@ -4,7 +4,7 @@ PCRE LICENCE
|
||||
PCRE is a library of functions to support regular expressions whose syntax
|
||||
and semantics are as close as possible to those of the Perl 5 language.
|
||||
|
||||
Release 6 of PCRE is distributed under the terms of the "BSD" licence, as
|
||||
Release 7 of PCRE is distributed under the terms of the "BSD" licence, as
|
||||
specified below. The documentation for PCRE, supplied in the "doc"
|
||||
directory, is distributed under the same terms as the software itself.
|
||||
|
||||
@@ -20,9 +20,9 @@ Email local part: ph10
|
||||
Email domain: cam.ac.uk
|
||||
|
||||
University of Cambridge Computing Service,
|
||||
Cambridge, England. Phone: +44 1223 334714.
|
||||
Cambridge, England.
|
||||
|
||||
Copyright (c) 1997-2006 University of Cambridge
|
||||
Copyright (c) 1997-2009 University of Cambridge
|
||||
All rights reserved.
|
||||
|
||||
|
||||
@@ -31,7 +31,7 @@ THE C++ WRAPPER FUNCTIONS
|
||||
|
||||
Contributed by: Google Inc.
|
||||
|
||||
Copyright (c) 2006, Google Inc.
|
||||
Copyright (c) 2007-2008, Google Inc.
|
||||
All rights reserved.
|
||||
|
||||
|
||||
|
||||
@@ -0,0 +1,390 @@
|
||||
## Process this file with automake to produce Makefile.in.
|
||||
|
||||
dist_doc_DATA = \
|
||||
doc/pcre.txt \
|
||||
doc/pcre-config.txt \
|
||||
doc/pcregrep.txt \
|
||||
doc/pcretest.txt \
|
||||
AUTHORS \
|
||||
COPYING \
|
||||
ChangeLog \
|
||||
LICENCE \
|
||||
NEWS \
|
||||
README
|
||||
|
||||
dist_html_DATA = \
|
||||
doc/html/index.html \
|
||||
doc/html/pcre.html \
|
||||
doc/html/pcre-config.html \
|
||||
doc/html/pcre_compile.html \
|
||||
doc/html/pcre_compile2.html \
|
||||
doc/html/pcre_config.html \
|
||||
doc/html/pcre_copy_named_substring.html \
|
||||
doc/html/pcre_copy_substring.html \
|
||||
doc/html/pcre_dfa_exec.html \
|
||||
doc/html/pcre_exec.html \
|
||||
doc/html/pcre_free_substring.html \
|
||||
doc/html/pcre_free_substring_list.html \
|
||||
doc/html/pcre_fullinfo.html \
|
||||
doc/html/pcre_get_named_substring.html \
|
||||
doc/html/pcre_get_stringnumber.html \
|
||||
doc/html/pcre_get_stringtable_entries.html \
|
||||
doc/html/pcre_get_substring.html \
|
||||
doc/html/pcre_get_substring_list.html \
|
||||
doc/html/pcre_info.html \
|
||||
doc/html/pcre_maketables.html \
|
||||
doc/html/pcre_refcount.html \
|
||||
doc/html/pcre_study.html \
|
||||
doc/html/pcre_version.html \
|
||||
doc/html/pcreapi.html \
|
||||
doc/html/pcrebuild.html \
|
||||
doc/html/pcrecallout.html \
|
||||
doc/html/pcrecompat.html \
|
||||
doc/html/pcregrep.html \
|
||||
doc/html/pcrematching.html \
|
||||
doc/html/pcrepartial.html \
|
||||
doc/html/pcrepattern.html \
|
||||
doc/html/pcreperform.html \
|
||||
doc/html/pcreposix.html \
|
||||
doc/html/pcreprecompile.html \
|
||||
doc/html/pcresample.html \
|
||||
doc/html/pcrestack.html \
|
||||
doc/html/pcresyntax.html \
|
||||
doc/html/pcretest.html
|
||||
|
||||
pcrecpp_html = doc/html/pcrecpp.html
|
||||
dist_noinst_DATA = $(pcrecpp_html)
|
||||
|
||||
if WITH_PCRE_CPP
|
||||
html_DATA = $(pcrecpp_html)
|
||||
endif
|
||||
|
||||
# The Libtool libraries to install. We'll add to this later.
|
||||
lib_LTLIBRARIES =
|
||||
|
||||
# Unit tests you want to run when people type 'make check'.
|
||||
# TESTS is for binary unit tests, check_SCRIPTS for script-based tests
|
||||
TESTS =
|
||||
check_SCRIPTS =
|
||||
dist_noinst_SCRIPTS =
|
||||
|
||||
# Some of the binaries we make are to be installed, and others are
|
||||
# (non-user-visible) helper programs needed to build libpcre.
|
||||
bin_PROGRAMS =
|
||||
noinst_PROGRAMS =
|
||||
|
||||
# Additional files to delete on 'make clean' and 'make maintainer-clean'.
|
||||
CLEANFILES =
|
||||
MAINTAINERCLEANFILES =
|
||||
|
||||
# Additional files to bundle with the distribution, over and above what
|
||||
# the Autotools include by default.
|
||||
EXTRA_DIST =
|
||||
|
||||
# These files contain maintenance information
|
||||
EXTRA_DIST += \
|
||||
doc/perltest.txt \
|
||||
NON-UNIX-USE \
|
||||
HACKING
|
||||
|
||||
# These files are used in the preparation of a release
|
||||
EXTRA_DIST += \
|
||||
PrepareRelease \
|
||||
CleanTxt \
|
||||
Detrail \
|
||||
132html \
|
||||
doc/index.html.src
|
||||
|
||||
# These files are to do with building for Virtual Pascal
|
||||
EXTRA_DIST += \
|
||||
makevp.bat \
|
||||
makevp_c.txt \
|
||||
makevp_l.txt \
|
||||
pcregexp.pas
|
||||
|
||||
# These files are usable versions of pcre.h and config.h that are distributed
|
||||
# for the benefit of people who are building PCRE manually, without the
|
||||
# Autotools support.
|
||||
EXTRA_DIST += \
|
||||
pcre.h.generic \
|
||||
config.h.generic
|
||||
|
||||
pcre.h.generic: configure.ac
|
||||
rm -f $@
|
||||
cp -p pcre.h $@
|
||||
|
||||
MAINTAINERCLEANFILES += pcre.h.generic
|
||||
|
||||
# These are the header files we'll install. We do not distribute pcre.h because
|
||||
# it is generated from pcre.h.in.
|
||||
nodist_include_HEADERS = \
|
||||
pcre.h
|
||||
include_HEADERS = \
|
||||
pcreposix.h
|
||||
|
||||
# These additional headers will be installed if C++ support is enabled. We
|
||||
# do not distribute pcrecpparg.h or pcre_stringpiece.h, as these are generated
|
||||
# from corresponding .h.in files (which we do distribute).
|
||||
if WITH_PCRE_CPP
|
||||
nodist_include_HEADERS += \
|
||||
pcrecpparg.h \
|
||||
pcre_stringpiece.h
|
||||
include_HEADERS += \
|
||||
pcrecpp.h \
|
||||
pcre_scanner.h
|
||||
endif # WITH_PCRE_CPP
|
||||
|
||||
bin_SCRIPTS = pcre-config
|
||||
|
||||
## ---------------------------------------------------------------
|
||||
## The dftables program is used to rebuild character tables before compiling
|
||||
## PCRE, if --enable-rebuild-chartables is specified. It is not a user-visible
|
||||
## program. The default (when --enable-rebuild-chartables is not specified) is
|
||||
## to copy a distributed set of tables that are defined for ASCII code. In this
|
||||
## case, dftables is not needed.
|
||||
|
||||
if WITH_REBUILD_CHARTABLES
|
||||
|
||||
noinst_PROGRAMS += dftables
|
||||
dftables_SOURCES = dftables.c
|
||||
|
||||
pcre_chartables.c: dftables$(EXEEXT)
|
||||
./dftables$(EXEEXT) $@
|
||||
else
|
||||
|
||||
pcre_chartables.c: $(srcdir)/pcre_chartables.c.dist
|
||||
rm -f $@
|
||||
$(LN_S) $(srcdir)/pcre_chartables.c.dist $@
|
||||
|
||||
endif # WITH_REBUILD_CHARTABLES
|
||||
|
||||
|
||||
## The main pcre library
|
||||
lib_LTLIBRARIES += libpcre.la
|
||||
libpcre_la_SOURCES = \
|
||||
pcre_compile.c \
|
||||
pcre_config.c \
|
||||
pcre_dfa_exec.c \
|
||||
pcre_exec.c \
|
||||
pcre_fullinfo.c \
|
||||
pcre_get.c \
|
||||
pcre_globals.c \
|
||||
pcre_info.c \
|
||||
pcre_internal.h \
|
||||
pcre_maketables.c \
|
||||
pcre_newline.c \
|
||||
pcre_ord2utf8.c \
|
||||
pcre_refcount.c \
|
||||
pcre_study.c \
|
||||
pcre_tables.c \
|
||||
pcre_try_flipped.c \
|
||||
pcre_ucd.c \
|
||||
pcre_valid_utf8.c \
|
||||
pcre_version.c \
|
||||
pcre_xclass.c \
|
||||
ucp.h
|
||||
|
||||
## This file is generated as part of the building process, so don't distribute.
|
||||
nodist_libpcre_la_SOURCES = \
|
||||
pcre_chartables.c
|
||||
|
||||
# The pcre_printint.src file is #included by some source files, so it must be
|
||||
# distributed. The pcre_chartables.c.dist file is the default version of
|
||||
# pcre_chartables.c, used unless --enable-rebuild-chartables is specified.
|
||||
EXTRA_DIST += pcre_printint.src pcre_chartables.c.dist
|
||||
|
||||
libpcre_la_LDFLAGS = $(EXTRA_LIBPCRE_LDFLAGS)
|
||||
|
||||
CLEANFILES += pcre_chartables.c
|
||||
|
||||
## A version of the main pcre library that has a posix re API.
|
||||
lib_LTLIBRARIES += libpcreposix.la
|
||||
libpcreposix_la_SOURCES = \
|
||||
pcreposix.c
|
||||
libpcreposix_la_LDFLAGS = $(EXTRA_LIBPCREPOSIX_LDFLAGS)
|
||||
libpcreposix_la_LIBADD = libpcre.la
|
||||
|
||||
## There's a C++ library as well.
|
||||
if WITH_PCRE_CPP
|
||||
|
||||
lib_LTLIBRARIES += libpcrecpp.la
|
||||
libpcrecpp_la_SOURCES = \
|
||||
pcrecpp_internal.h \
|
||||
pcrecpp.cc \
|
||||
pcre_scanner.cc \
|
||||
pcre_stringpiece.cc
|
||||
libpcrecpp_la_LDFLAGS = $(EXTRA_LIBPCRECPP_LDFLAGS)
|
||||
libpcrecpp_la_LIBADD = libpcre.la
|
||||
|
||||
TESTS += pcrecpp_unittest
|
||||
noinst_PROGRAMS += pcrecpp_unittest
|
||||
pcrecpp_unittest_SOURCES = pcrecpp_unittest.cc
|
||||
pcrecpp_unittest_LDADD = libpcrecpp.la
|
||||
|
||||
TESTS += pcre_scanner_unittest
|
||||
noinst_PROGRAMS += pcre_scanner_unittest
|
||||
pcre_scanner_unittest_SOURCES = pcre_scanner_unittest.cc
|
||||
pcre_scanner_unittest_LDADD = libpcrecpp.la
|
||||
|
||||
TESTS += pcre_stringpiece_unittest
|
||||
noinst_PROGRAMS += pcre_stringpiece_unittest
|
||||
pcre_stringpiece_unittest_SOURCES = pcre_stringpiece_unittest.cc
|
||||
pcre_stringpiece_unittest_LDADD = libpcrecpp.la
|
||||
|
||||
endif # WITH_PCRE_CPP
|
||||
|
||||
## The main unit tests
|
||||
|
||||
# Each unit test is a binary plus a script that runs that binary in various
|
||||
# ways. We install these test binaries in case folks find it helpful.
|
||||
|
||||
TESTS += RunTest
|
||||
dist_noinst_SCRIPTS += RunTest
|
||||
EXTRA_DIST += RunTest.bat
|
||||
bin_PROGRAMS += pcretest
|
||||
pcretest_SOURCES = pcretest.c
|
||||
pcretest_LDADD = libpcreposix.la $(LIBREADLINE)
|
||||
|
||||
TESTS += RunGrepTest
|
||||
dist_noinst_SCRIPTS += RunGrepTest
|
||||
bin_PROGRAMS += pcregrep
|
||||
pcregrep_SOURCES = pcregrep.c
|
||||
pcregrep_LDADD = libpcreposix.la $(LIBZ) $(LIBBZ2)
|
||||
|
||||
EXTRA_DIST += \
|
||||
testdata/grepinput \
|
||||
testdata/grepinput8 \
|
||||
testdata/grepinputv \
|
||||
testdata/grepinputx \
|
||||
testdata/greplist \
|
||||
testdata/grepoutput \
|
||||
testdata/grepoutput8 \
|
||||
testdata/grepoutputN \
|
||||
testdata/testinput1 \
|
||||
testdata/testinput2 \
|
||||
testdata/testinput3 \
|
||||
testdata/testinput4 \
|
||||
testdata/testinput5 \
|
||||
testdata/testinput6 \
|
||||
testdata/testinput7 \
|
||||
testdata/testinput8 \
|
||||
testdata/testinput9 \
|
||||
testdata/testinput10 \
|
||||
testdata/testoutput1 \
|
||||
testdata/testoutput2 \
|
||||
testdata/testoutput3 \
|
||||
testdata/testoutput4 \
|
||||
testdata/testoutput5 \
|
||||
testdata/testoutput6 \
|
||||
testdata/testoutput7 \
|
||||
testdata/testoutput8 \
|
||||
testdata/testoutput9 \
|
||||
testdata/testoutput10 \
|
||||
testdata/wintestinput3 \
|
||||
testdata/wintestoutput3 \
|
||||
perltest.pl
|
||||
|
||||
CLEANFILES += \
|
||||
testsavedregex \
|
||||
teststderr \
|
||||
testtry \
|
||||
testNinput
|
||||
|
||||
|
||||
# PCRE demonstration program. No longer built automatically. The point is that
|
||||
# the users should build it themselves. So just distribute the source.
|
||||
# noinst_PROGRAMS += pcredemo
|
||||
# pcredemo_SOURCES = pcredemo.c
|
||||
# pcredemo_LDADD = libpcre.la
|
||||
|
||||
EXTRA_DIST += pcredemo.c
|
||||
|
||||
|
||||
## Utility rules, documentation, etc.
|
||||
|
||||
# A compatibility line, the old build system worked with 'make test'
|
||||
test: check ;
|
||||
|
||||
|
||||
# A PCRE user submitted the following addition, saying that it "will allow
|
||||
# anyone using the 'mingw32' compiler to simply type 'make pcre.dll' and get a
|
||||
# nice DLL for Windows use". (It is used by the pcre.dll target.)
|
||||
DLL_OBJS= pcre_compile.o pcre_config.o \
|
||||
pcre_dfa_exec.o pcre_exec.o pcre_fullinfo.o pcre_get.o \
|
||||
pcre_globals.o pcre_info.o pcre_maketables.o \
|
||||
pcre_newline.o pcre_ord2utf8.o pcre_refcount.o \
|
||||
pcre_study.o pcre_tables.o pcre_try_flipped.o \
|
||||
pcre_ucd.o pcre_valid_utf8.o pcre_version.o \
|
||||
pcre_chartables.o \
|
||||
pcre_xclass.o
|
||||
|
||||
# A PCRE user submitted the following addition, saying that it "will allow
|
||||
# anyone using the 'mingw32' compiler to simply type 'make pcre.dll' and get a
|
||||
# nice DLL for Windows use".
|
||||
pcre.dll: $(DLL_OBJS)
|
||||
$(CC) -shared -o pcre.dll -Wl,"--strip-all" -Wl,"--export-all-symbols" $(DLL_OBJS)
|
||||
|
||||
|
||||
# We have .pc files for pkg-config users.
|
||||
pkgconfigdir = $(libdir)/pkgconfig
|
||||
pkgconfig_DATA = libpcre.pc
|
||||
if WITH_PCRE_CPP
|
||||
pkgconfig_DATA += libpcrecpp.pc
|
||||
endif
|
||||
|
||||
dist_man_MANS = \
|
||||
doc/pcre.3 \
|
||||
doc/pcre-config.1 \
|
||||
doc/pcre_compile.3 \
|
||||
doc/pcre_compile2.3 \
|
||||
doc/pcre_config.3 \
|
||||
doc/pcre_copy_named_substring.3 \
|
||||
doc/pcre_copy_substring.3 \
|
||||
doc/pcre_dfa_exec.3 \
|
||||
doc/pcre_exec.3 \
|
||||
doc/pcre_free_substring.3 \
|
||||
doc/pcre_free_substring_list.3 \
|
||||
doc/pcre_fullinfo.3 \
|
||||
doc/pcre_get_named_substring.3 \
|
||||
doc/pcre_get_stringnumber.3 \
|
||||
doc/pcre_get_stringtable_entries.3 \
|
||||
doc/pcre_get_substring.3 \
|
||||
doc/pcre_get_substring_list.3 \
|
||||
doc/pcre_info.3 \
|
||||
doc/pcre_maketables.3 \
|
||||
doc/pcre_refcount.3 \
|
||||
doc/pcre_study.3 \
|
||||
doc/pcre_version.3 \
|
||||
doc/pcreapi.3 \
|
||||
doc/pcrebuild.3 \
|
||||
doc/pcrecallout.3 \
|
||||
doc/pcrecompat.3 \
|
||||
doc/pcregrep.1 \
|
||||
doc/pcrematching.3 \
|
||||
doc/pcrepartial.3 \
|
||||
doc/pcrepattern.3 \
|
||||
doc/pcreperform.3 \
|
||||
doc/pcreposix.3 \
|
||||
doc/pcreprecompile.3 \
|
||||
doc/pcresample.3 \
|
||||
doc/pcrestack.3 \
|
||||
doc/pcresyntax.3 \
|
||||
doc/pcretest.1
|
||||
|
||||
pcrecpp_man = doc/pcrecpp.3
|
||||
EXTRA_DIST += $(pcrecpp_man)
|
||||
|
||||
if WITH_PCRE_CPP
|
||||
man_MANS = $(pcrecpp_man)
|
||||
endif
|
||||
|
||||
## CMake support
|
||||
|
||||
EXTRA_DIST += \
|
||||
cmake/COPYING-CMAKE-SCRIPTS \
|
||||
cmake/FindPackageHandleStandardArgs.cmake \
|
||||
cmake/FindReadline.cmake \
|
||||
CMakeLists.txt \
|
||||
config-cmake.h.in
|
||||
|
||||
## end Makefile.am
|
||||
+1414
-590
File diff suppressed because it is too large
Load Diff
+162
@@ -1,6 +1,168 @@
|
||||
News about PCRE releases
|
||||
------------------------
|
||||
|
||||
Release 7.9 11-Apr-09
|
||||
---------------------
|
||||
|
||||
Mostly bugfixes and tidies with just a couple of minor functional additions.
|
||||
|
||||
|
||||
Release 7.8 05-Sep-08
|
||||
---------------------
|
||||
|
||||
More bug fixes, plus a performance improvement in Unicode character property
|
||||
lookup.
|
||||
|
||||
|
||||
Release 7.7 07-May-08
|
||||
---------------------
|
||||
|
||||
This is once again mainly a bug-fix release, but there are a couple of new
|
||||
features.
|
||||
|
||||
|
||||
Release 7.6 28-Jan-08
|
||||
---------------------
|
||||
|
||||
The main reason for having this release so soon after 7.5 is because it fixes a
|
||||
potential buffer overflow problem in pcre_compile() when run in UTF-8 mode. In
|
||||
addition, the CMake configuration files have been brought up to date.
|
||||
|
||||
|
||||
Release 7.5 10-Jan-08
|
||||
---------------------
|
||||
|
||||
This is mainly a bug-fix release. However the ability to link pcregrep with
|
||||
libz or libbz2 and the ability to link pcretest with libreadline have been
|
||||
added. Also the --line-offsets and --file-offsets options were added to
|
||||
pcregrep.
|
||||
|
||||
|
||||
Release 7.4 21-Sep-07
|
||||
---------------------
|
||||
|
||||
The only change of specification is the addition of options to control whether
|
||||
\R matches any Unicode line ending (the default) or just CR, LF, and CRLF.
|
||||
Otherwise, the changes are bug fixes and a refactoring to reduce the number of
|
||||
relocations needed in a shared library. There have also been some documentation
|
||||
updates, in particular, some more information about using CMake to build PCRE
|
||||
has been added to the NON-UNIX-USE file.
|
||||
|
||||
|
||||
Release 7.3 28-Aug-07
|
||||
---------------------
|
||||
|
||||
Most changes are bug fixes. Some that are not:
|
||||
|
||||
1. There is some support for Perl 5.10's experimental "backtracking control
|
||||
verbs" such as (*PRUNE).
|
||||
|
||||
2. UTF-8 checking is now as per RFC 3629 instead of RFC 2279; this is more
|
||||
restrictive in the strings it accepts.
|
||||
|
||||
3. Checking for potential integer overflow has been made more dynamic, and as a
|
||||
consequence there is no longer a hard limit on the size of a subpattern that
|
||||
has a limited repeat count.
|
||||
|
||||
4. When CRLF is a valid line-ending sequence, pcre_exec() and pcre_dfa_exec()
|
||||
no longer advance by two characters instead of one when an unanchored match
|
||||
fails at CRLF if there are explicit CR or LF matches within the pattern.
|
||||
This gets rid of some anomalous effects that previously occurred.
|
||||
|
||||
5. Some PCRE-specific settings for varying the newline options at the start of
|
||||
a pattern have been added.
|
||||
|
||||
|
||||
Release 7.2 19-Jun-07
|
||||
---------------------
|
||||
|
||||
WARNING: saved patterns that were compiled by earlier versions of PCRE must be
|
||||
recompiled for use with 7.2 (necessitated by the addition of \K, \h, \H, \v,
|
||||
and \V).
|
||||
|
||||
Correction to the notes for 7.1: the note about shared libraries for Windows is
|
||||
wrong. Previously, three libraries were built, but each could function
|
||||
independently. For example, the pcreposix library also included all the
|
||||
functions from the basic pcre library. The change is that the three libraries
|
||||
are no longer independent. They are like the Unix libraries. To use the
|
||||
pcreposix functions, for example, you need to link with both the pcreposix and
|
||||
the basic pcre library.
|
||||
|
||||
Some more features from Perl 5.10 have been added:
|
||||
|
||||
(?-n) and (?+n) relative references for recursion and subroutines.
|
||||
|
||||
(?(-n) and (?(+n) relative references as conditions.
|
||||
|
||||
\k{name} and \g{name} are synonyms for \k<name>.
|
||||
|
||||
\K to reset the start of the matched string; for example, (foo)\Kbar
|
||||
matches bar preceded by foo, but only sets bar as the matched string.
|
||||
|
||||
(?| introduces a group where the capturing parentheses in each alternative
|
||||
start from the same number; for example, (?|(abc)|(xyz)) sets capturing
|
||||
parentheses number 1 in both cases.
|
||||
|
||||
\h, \H, \v, \V match horizontal and vertical whitespace, respectively.
|
||||
|
||||
|
||||
Release 7.1 24-Apr-07
|
||||
---------------------
|
||||
|
||||
There is only one new feature in this release: a linebreak setting of
|
||||
PCRE_NEWLINE_ANYCRLF. It is a cut-down version of PCRE_NEWLINE_ANY, which
|
||||
recognizes only CRLF, CR, and LF as linebreaks.
|
||||
|
||||
A few bugs are fixed (see ChangeLog for details), but the major change is a
|
||||
complete re-implementation of the build system. This now has full Autotools
|
||||
support and so is now "standard" in some sense. It should help with compiling
|
||||
PCRE in a wide variety of environments.
|
||||
|
||||
NOTE: when building shared libraries for Windows, three dlls are now built,
|
||||
called libpcre, libpcreposix, and libpcrecpp. Previously, everything was
|
||||
included in a single dll.
|
||||
|
||||
Another important change is that the dftables auxiliary program is no longer
|
||||
compiled and run at "make" time by default. Instead, a default set of character
|
||||
tables (assuming ASCII coding) is used. If you want to use dftables to generate
|
||||
the character tables as previously, add --enable-rebuild-chartables to the
|
||||
"configure" command. You must do this if you are compiling PCRE to run on a
|
||||
system that uses EBCDIC code.
|
||||
|
||||
There is a discussion about character tables in the README file. The default is
|
||||
not to use dftables so that there is no problem when cross-compiling.
|
||||
|
||||
|
||||
Release 7.0 19-Dec-06
|
||||
---------------------
|
||||
|
||||
This release has a new major number because there have been some internal
|
||||
upheavals to facilitate the addition of new optimizations and other facilities,
|
||||
and to make subsequent maintenance and extension easier. Compilation is likely
|
||||
to be a bit slower, but there should be no major effect on runtime performance.
|
||||
Previously compiled patterns are NOT upwards compatible with this release. If
|
||||
you have saved compiled patterns from a previous release, you will have to
|
||||
re-compile them. Important changes that are visible to users are:
|
||||
|
||||
1. The Unicode property tables have been updated to Unicode 5.0.0, which adds
|
||||
some more scripts.
|
||||
|
||||
2. The option PCRE_NEWLINE_ANY causes PCRE to recognize any Unicode newline
|
||||
sequence as a newline.
|
||||
|
||||
3. The \R escape matches a single Unicode newline sequence as a single unit.
|
||||
|
||||
4. New features that will appear in Perl 5.10 are now in PCRE. These include
|
||||
alternative Perl syntax for named parentheses, and Perl syntax for
|
||||
recursion.
|
||||
|
||||
5. The C++ wrapper interface has been extended by the addition of a
|
||||
QuoteMeta function and the ability to allow copy construction and
|
||||
assignment.
|
||||
|
||||
For a complete list of changes, see the ChangeLog file.
|
||||
|
||||
|
||||
Release 6.7 04-Jul-06
|
||||
---------------------
|
||||
|
||||
|
||||
+334
-155
@@ -1,127 +1,154 @@
|
||||
Compiling PCRE on non-Unix systems
|
||||
----------------------------------
|
||||
|
||||
See below for comments on Cygwin or MinGW and OpenVMS usage. I (Philip Hazel)
|
||||
have no knowledge of Windows or VMS sytems and how their libraries work. The
|
||||
items in the PCRE Makefile that relate to anything other than Unix-like systems
|
||||
have been contributed by PCRE users. There are some other comments and files in
|
||||
the Contrib directory on the ftp site that you may find useful. See
|
||||
This document contains the following sections:
|
||||
|
||||
General
|
||||
Generic instructions for the PCRE C library
|
||||
The C++ wrapper functions
|
||||
Building for virtual Pascal
|
||||
Stack size in Windows environments
|
||||
Linking programs in Windows environments
|
||||
Comments about Win32 builds
|
||||
Building PCRE on Windows with CMake
|
||||
Use of relative paths with CMake on Windows
|
||||
Testing with runtest.bat
|
||||
Building under Windows with BCC5.5
|
||||
Building PCRE on OpenVMS
|
||||
|
||||
|
||||
GENERAL
|
||||
|
||||
I (Philip Hazel) have no experience of Windows or VMS systems and how their
|
||||
libraries work. The items in the PCRE distribution and Makefile that relate to
|
||||
anything other than Unix-like systems are untested by me.
|
||||
|
||||
There are some other comments and files (including some documentation in CHM
|
||||
format) in the Contrib directory on the FTP site:
|
||||
|
||||
ftp://ftp.csx.cam.ac.uk/pub/software/programming/pcre/Contrib
|
||||
|
||||
If you want to compile PCRE for a non-Unix system (or perhaps, more strictly,
|
||||
for a system that does not support "configure" and "make" files), note that
|
||||
the basic PCRE library consists entirely of code written in Standard C, and so
|
||||
should compile successfully on any system that has a Standard C compiler and
|
||||
library. The C++ wrapper functions are a separate issue (see below).
|
||||
If you want to compile PCRE for a non-Unix system (especially for a system that
|
||||
does not support "configure" and "make" files), note that the basic PCRE
|
||||
library consists entirely of code written in Standard C, and so should compile
|
||||
successfully on any system that has a Standard C compiler and library. The C++
|
||||
wrapper functions are a separate issue (see below).
|
||||
|
||||
The PCRE distribution includes a "configure" file for use by the Configure/Make
|
||||
build system, as found in many Unix-like environments. There is also support
|
||||
for CMake, which some users prefer, in particular in Windows
|
||||
environments. There are some instructions for CMake under Windows in the
|
||||
section entitled "Building PCRE on Windows with CMake" below. CMake can also be used to
|
||||
build PCRE in Unix-like systems.
|
||||
|
||||
|
||||
GENERIC INSTRUCTIONS FOR THE C LIBRARY
|
||||
GENERIC INSTRUCTIONS FOR THE PCRE C LIBRARY
|
||||
|
||||
The following are generic comments about building PCRE. The interspersed
|
||||
indented commands are suggestions from Mark Tetrode as to which commands you
|
||||
might use on a Windows system to build a static library.
|
||||
The following are generic comments about building the PCRE C library "by hand".
|
||||
|
||||
(1) Copy or rename the file config.in as config.h, and change the macros that
|
||||
define HAVE_STRERROR and HAVE_MEMMOVE to define them as 1 rather than 0.
|
||||
Unfortunately, because of the way Unix autoconf works, the default setting has
|
||||
to be 0. You may also want to make changes to other macros in config.h. In
|
||||
particular, if you want to force a specific value for newline, you can define
|
||||
the NEWLINE macro. The default is to use '\n', thereby using whatever value
|
||||
your compiler gives to '\n'.
|
||||
(1) Copy or rename the file config.h.generic as config.h, and edit the macro
|
||||
settings that it contains to whatever is appropriate for your environment.
|
||||
In particular, if you want to force a specific value for newline, you can
|
||||
define the NEWLINE macro. When you compile any of the PCRE modules, you
|
||||
must specify -DHAVE_CONFIG_H to your compiler so that config.h is included
|
||||
in the sources.
|
||||
|
||||
rem Mark Tetrode's commands
|
||||
copy config.in config.h
|
||||
rem Use write, because notepad cannot handle UNIX files. Change values.
|
||||
write config.h
|
||||
An alternative approach is not to edit config.h, but to use -D on the
|
||||
compiler command line to make any changes that you need to the
|
||||
configuration options. In this case -DHAVE_CONFIG_H must not be set.
|
||||
|
||||
(2) Compile dftables.c as a stand-alone program, and then run it with
|
||||
the single argument "pcre_chartables.c". This generates a set of standard
|
||||
character tables and writes them to that file.
|
||||
NOTE: There have been occasions when the way in which certain parameters
|
||||
in config.h are used has changed between releases. (In the configure/make
|
||||
world, this is handled automatically.) When upgrading to a new release,
|
||||
you are strongly advised to review config.h.generic before re-using what
|
||||
you had previously.
|
||||
|
||||
rem Mark Tetrode's commands
|
||||
rem Compile & run
|
||||
cl -DSUPPORT_UTF8 -DSUPPORT_UCP dftables.c
|
||||
dftables.exe pcre_chartables.c
|
||||
(2) Copy or rename the file pcre.h.generic as pcre.h.
|
||||
|
||||
(3) Compile the following source files:
|
||||
(3) EITHER:
|
||||
Copy or rename file pcre_chartables.c.dist as pcre_chartables.c.
|
||||
|
||||
pcre_chartables.c
|
||||
pcre_compile.c
|
||||
pcre_config.c
|
||||
pcre_dfa_exec.c
|
||||
pcre_exec.c
|
||||
pcre_fullinfo.c
|
||||
pcre_get.c
|
||||
pcre_globals.c
|
||||
pcre_info.c
|
||||
pcre_maketables.c
|
||||
pcre_ord2utf8.c
|
||||
pcre_refcount.c
|
||||
pcre_study.c
|
||||
pcre_tables.c
|
||||
pcre_try_flipped.c
|
||||
pcre_ucp_searchfuncs.c
|
||||
pcre_valid_utf8.c
|
||||
pcre_version.c
|
||||
pcre_xclass.c
|
||||
OR:
|
||||
Compile dftables.c as a stand-alone program (using -DHAVE_CONFIG_H if
|
||||
you have set up config.h), and then run it with the single argument
|
||||
"pcre_chartables.c". This generates a set of standard character tables
|
||||
and writes them to that file. The tables are generated using the default
|
||||
C locale for your system. If you want to use a locale that is specified
|
||||
by LC_xxx environment variables, add the -L option to the dftables
|
||||
command. You must use this method if you are building on a system that
|
||||
uses EBCDIC code.
|
||||
|
||||
and link them all together into an object library in whichever form your system
|
||||
keeps such libraries. This is the pcre C library. If your system has static and
|
||||
shared libraries, you may have to do this once for each type.
|
||||
The tables in pcre_chartables.c are defaults. The caller of PCRE can
|
||||
specify alternative tables at run time.
|
||||
|
||||
rem These comments are out-of-date, referring to a previous release which
|
||||
rem had fewer source files. Replace with the file names from above.
|
||||
rem Mark Tetrode's commands, for a static library
|
||||
rem Compile & lib
|
||||
cl -DSUPPORT_UTF8 -DSUPPORT_UCP -DPOSIX_MALLOC_THRESHOLD=10 /c maketables.c get.c study.c pcre.c
|
||||
lib /OUT:pcre.lib maketables.obj get.obj study.obj pcre.obj
|
||||
(4) Ensure that you have the following header files:
|
||||
|
||||
(4) Similarly, compile pcreposix.c and link it (on its own) as the pcreposix
|
||||
library.
|
||||
pcre_internal.h
|
||||
ucp.h
|
||||
|
||||
rem Mark Tetrode's commands, for a static library
|
||||
rem Compile & lib
|
||||
cl -DSUPPORT_UTF8 -DSUPPORT_UCP -DPOSIX_MALLOC_THRESHOLD=10 /c pcreposix.c
|
||||
lib /OUT:pcreposix.lib pcreposix.obj
|
||||
(5) Also ensure that you have the following file, which is #included as source
|
||||
when building a debugging version of PCRE, and is also used by pcretest.
|
||||
|
||||
(5) Compile the test program pcretest.c. This needs the functions in the
|
||||
pcre and pcreposix libraries when linking.
|
||||
pcre_printint.src
|
||||
|
||||
rem Mark Tetrode's commands
|
||||
rem compile & link
|
||||
cl /F0x400000 pcretest.c pcre.lib pcreposix.lib
|
||||
(6) Compile the following source files, setting -DHAVE_CONFIG_H as a compiler
|
||||
option if you have set up config.h with your configuration, or else use
|
||||
other -D settings to change the configuration as required.
|
||||
|
||||
(6) Run pcretest on the testinput files in the testdata directory, and check
|
||||
that the output matches the corresponding testoutput files. You must use the
|
||||
-i option when checking testinput2. Note that the supplied files are in Unix
|
||||
format, with just LF characters as line terminators. You may need to edit them
|
||||
to change this if your system uses a different convention.
|
||||
pcre_chartables.c
|
||||
pcre_compile.c
|
||||
pcre_config.c
|
||||
pcre_dfa_exec.c
|
||||
pcre_exec.c
|
||||
pcre_fullinfo.c
|
||||
pcre_get.c
|
||||
pcre_globals.c
|
||||
pcre_info.c
|
||||
pcre_maketables.c
|
||||
pcre_newline.c
|
||||
pcre_ord2utf8.c
|
||||
pcre_refcount.c
|
||||
pcre_study.c
|
||||
pcre_tables.c
|
||||
pcre_try_flipped.c
|
||||
pcre_ucd.c
|
||||
pcre_valid_utf8.c
|
||||
pcre_version.c
|
||||
pcre_xclass.c
|
||||
|
||||
rem Mark Tetrode's commands
|
||||
pcretest testdata\testinput1 testdata\myoutput1
|
||||
windiff testdata\testoutput1 testdata\myoutput1
|
||||
pcretest -i testdata\testinput2 testdata\myoutput2
|
||||
windiff testdata\testoutput2 testdata\myoutput2
|
||||
pcretest testdata\testinput3 testdata\myoutput3
|
||||
windiff testdata\testoutput3 testdata\myoutput3
|
||||
pcretest testdata\testinput4 testdata\myoutput4
|
||||
windiff testdata\testoutput4 testdata\myoutput4
|
||||
pcretest testdata\testinput5 testdata\myoutput5
|
||||
windiff testdata\testoutput5 testdata\myoutput5
|
||||
pcretest testdata\testinput6 testdata\myoutput6
|
||||
windiff testdata\testoutput6 testdata\myoutput6
|
||||
Make sure that you include -I. in the compiler command (or equivalent for
|
||||
an unusual compiler) so that all included PCRE header files are first
|
||||
sought in the current directory. Otherwise you run the risk of picking up
|
||||
a previously-installed file from somewhere else.
|
||||
|
||||
Note that there are now three more tests (7, 8, 9) that did not exist when Mark
|
||||
wrote those comments. The test the new pcre_dfa_exec() function.
|
||||
(7) Now link all the compiled code into an object library in whichever form
|
||||
your system keeps such libraries. This is the basic PCRE C library. If
|
||||
your system has static and shared libraries, you may have to do this once
|
||||
for each type.
|
||||
|
||||
(7) If you want to use the pcregrep command, compile and link pcregrep.c; it
|
||||
uses only the basic PCRE library.
|
||||
(8) Similarly, compile pcreposix.c (remembering -DHAVE_CONFIG_H if necessary)
|
||||
and link the result (on its own) as the pcreposix library.
|
||||
|
||||
(9) Compile the test program pcretest.c (again, don't forget -DHAVE_CONFIG_H).
|
||||
This needs the functions in the pcre and pcreposix libraries when linking.
|
||||
It also needs the pcre_printint.src source file, which it #includes.
|
||||
|
||||
(10) Run pcretest on the testinput files in the testdata directory, and check
|
||||
that the output matches the corresponding testoutput files. Note that the
|
||||
supplied files are in Unix format, with just LF characters as line
|
||||
terminators. You may need to edit them to change this if your system uses
|
||||
a different convention. If you are using Windows, you probably should use
|
||||
the wintestinput3 file instead of testinput3 (and the corresponding output
|
||||
file). This is a locale test; wintestinput3 sets the locale to "french"
|
||||
rather than "fr_FR", and there are some minor output differences.
|
||||
|
||||
(11) If you want to use the pcregrep command, compile and link pcregrep.c; it
|
||||
uses only the basic PCRE library (it does not need the pcreposix library).
|
||||
|
||||
|
||||
THE C++ WRAPPER FUNCTIONS
|
||||
|
||||
The PCRE distribution now contains some C++ wrapper functions and tests,
|
||||
The PCRE distribution also contains some C++ wrapper functions and tests,
|
||||
contributed by Google Inc. On a system that can use "configure" and "make",
|
||||
the functions are automatically built into a library called pcrecpp. It should
|
||||
be straightforward to compile the .cc files manually on other systems. The
|
||||
@@ -129,77 +156,228 @@ files called xxx_unittest.cc are test programs for each of the corresponding
|
||||
xxx.cc files.
|
||||
|
||||
|
||||
FURTHER REMARKS
|
||||
|
||||
If you have a system without "configure" but where you can use a Makefile, edit
|
||||
Makefile.in to create Makefile, substituting suitable values for the variables
|
||||
at the head of the file.
|
||||
|
||||
Some help in building a Win32 DLL of PCRE in GnuWin32 environments was
|
||||
contributed by Paul Sokolovsky. These environments are Mingw32
|
||||
(http://www.xraylith.wisc.edu/~khan/software/gnu-win32/) and CygWin
|
||||
(http://sourceware.cygnus.com/cygwin/). Paul comments:
|
||||
|
||||
For CygWin, set CFLAGS=-mno-cygwin, and do 'make dll'. You'll get
|
||||
pcre.dll (containing pcreposix also), libpcre.dll.a, and dynamically
|
||||
linked pgrep and pcretest. If you have /bin/sh, run RunTest (three
|
||||
main test go ok, locale not supported).
|
||||
|
||||
Changes to do MinGW with autoconf 2.50 were supplied by Fred Cox
|
||||
<sailorFred@yahoo.com>, who comments as follows:
|
||||
|
||||
If you are using the PCRE DLL, the normal Unix style configure && make &&
|
||||
make check && make install should just work[*]. If you want to statically
|
||||
link against the .a file, you must define PCRE_STATIC before including
|
||||
pcre.h, otherwise the pcre_malloc and pcre_free exported functions will be
|
||||
declared __declspec(dllimport), with hilarious results. See the configure.in
|
||||
and pcretest.c for how it is done for the static test.
|
||||
|
||||
Also, there will only be a libpcre.la, not a libpcreposix.la, as you
|
||||
would expect from the Unix version. The single DLL includes the pcreposix
|
||||
interface.
|
||||
|
||||
[*] But note that the supplied test files are in Unix format, with just LF
|
||||
characters as line terminators. You will have to edit them to change to CR LF
|
||||
terminators.
|
||||
BUILDING FOR VIRTUAL PASCAL
|
||||
|
||||
A script for building PCRE using Borland's C++ compiler for use with VPASCAL
|
||||
was contributed by Alexander Tokarev. It is called makevp.bat.
|
||||
was contributed by Alexander Tokarev. Stefan Weber updated the script and added
|
||||
additional files. The following files in the distribution are for building PCRE
|
||||
for use with VP/Borland: makevp_c.txt, makevp_l.txt, makevp.bat, pcregexp.pas.
|
||||
|
||||
These are some further comments about Win32 builds from Mark Evans. They
|
||||
were contributed before Fred Cox's changes were made, so it is possible that
|
||||
they may no longer be relevant.
|
||||
|
||||
"The documentation for Win32 builds is a bit shy. Under MSVC6 I
|
||||
followed their instructions to the letter, but there were still
|
||||
some things missing.
|
||||
STACK SIZE IN WINDOWS ENVIRONMENTS
|
||||
|
||||
(1) Must #define STATIC for entire project if linking statically.
|
||||
(I see no reason to use DLLs for code this compact.) This of
|
||||
course is a project setting in MSVC under Preprocessor.
|
||||
The default processor stack size of 1Mb in some Windows environments is too
|
||||
small for matching patterns that need much recursion. In particular, test 2 may
|
||||
fail because of this. Normally, running out of stack causes a crash, but there
|
||||
have been cases where the test program has just died silently. See your linker
|
||||
documentation for how to increase stack size if you experience problems. The
|
||||
Linux default of 8Mb is a reasonable choice for the stack, though even that can
|
||||
be too small for some pattern/subject combinations.
|
||||
|
||||
(2) Missing some #ifdefs relating to the function pointers
|
||||
pcre_malloc and pcre_free. See my solution below. (The stubs
|
||||
may not be mandatory but they made me feel better.)"
|
||||
PCRE has a compile configuration option to disable the use of stack for
|
||||
recursion so that heap is used instead. However, pattern matching is
|
||||
significantly slower when this is done. There is more about stack usage in the
|
||||
"pcrestack" documentation.
|
||||
|
||||
=========================
|
||||
#ifdef _WIN32
|
||||
#include <malloc.h>
|
||||
|
||||
void* malloc_stub(size_t N)
|
||||
{ return malloc(N); }
|
||||
void free_stub(void* p)
|
||||
{ free(p); }
|
||||
void *(*pcre_malloc)(size_t) = &malloc_stub;
|
||||
void (*pcre_free)(void *) = &free_stub;
|
||||
LINKING PROGRAMS IN WINDOWS ENVIRONMENTS
|
||||
|
||||
#else
|
||||
If you want to statically link a program against a PCRE library in the form of
|
||||
a non-dll .a file, you must define PCRE_STATIC before including pcre.h,
|
||||
otherwise the pcre_malloc() and pcre_free() exported functions will be declared
|
||||
__declspec(dllimport), with unwanted results.
|
||||
|
||||
void *(*pcre_malloc)(size_t) = malloc;
|
||||
void (*pcre_free)(void *) = free;
|
||||
|
||||
#endif
|
||||
=========================
|
||||
CALLING CONVENTIONS IN WINDOWS ENVIRONMENTS
|
||||
|
||||
It is possible to compile programs to use different calling conventions using
|
||||
MSVC. Search the web for "calling conventions" for more information. To make it
|
||||
easier to change the calling convention for the exported functions in the
|
||||
PCRE library, the macro PCRE_CALL_CONVENTION is present in all the external
|
||||
definitions. It can be set externally when compiling (e.g. in CFLAGS). If it is
|
||||
not set, it defaults to empty; the default calling convention is then used
|
||||
(which is what is wanted most of the time).
|
||||
|
||||
|
||||
COMMENTS ABOUT WIN32 BUILDS (see also "BUILDING PCRE ON WINDOWS WITH CMAKE" below)
|
||||
|
||||
There are two ways of building PCRE using the "configure, make, make install"
|
||||
paradigm on Windows systems: using MinGW or using Cygwin. These are not at all
|
||||
the same thing; they are completely different from each other. There is also
|
||||
support for building using CMake, which some users find a more straightforward
|
||||
way of building PCRE under Windows. However, the tests are not run
|
||||
automatically when CMake is used.
|
||||
|
||||
The MinGW home page (http://www.mingw.org/) says this:
|
||||
|
||||
MinGW: A collection of freely available and freely distributable Windows
|
||||
specific header files and import libraries combined with GNU toolsets that
|
||||
allow one to produce native Windows programs that do not rely on any
|
||||
3rd-party C runtime DLLs.
|
||||
|
||||
The Cygwin home page (http://www.cygwin.com/) says this:
|
||||
|
||||
Cygwin is a Linux-like environment for Windows. It consists of two parts:
|
||||
|
||||
. A DLL (cygwin1.dll) which acts as a Linux API emulation layer providing
|
||||
substantial Linux API functionality
|
||||
|
||||
. A collection of tools which provide Linux look and feel.
|
||||
|
||||
The Cygwin DLL currently works with all recent, commercially released x86 32
|
||||
bit and 64 bit versions of Windows, with the exception of Windows CE.
|
||||
|
||||
On both MinGW and Cygwin, PCRE should build correctly using:
|
||||
|
||||
./configure && make && make install
|
||||
|
||||
This should create two libraries called libpcre and libpcreposix, and, if you
|
||||
have enabled building the C++ wrapper, a third one called libpcrecpp. These are
|
||||
independent libraries: when you link with libpcreposix or libpcrecpp you must
|
||||
also link with libpcre, which contains the basic functions. (Some earlier
|
||||
releases of PCRE included the basic libpcre functions in libpcreposix. This no
|
||||
longer happens.)
|
||||
|
||||
A user submitted a special-purpose patch that makes it easy to create
|
||||
"pcre.dll" under mingw32 using the "msys" environment. It provides "pcre.dll"
|
||||
as a special target. If you use this target, no other files are built, and in
|
||||
particular, the pcretest and pcregrep programs are not built. An example of how
|
||||
this might be used is:
|
||||
|
||||
./configure --enable-utf --disable-cpp CFLAGS="-O3 -s"; make pcre.dll
|
||||
|
||||
Using Cygwin's compiler generates libraries and executables that depend on
|
||||
cygwin1.dll. If a library that is generated this way is distributed,
|
||||
cygwin1.dll has to be distributed as well. Since cygwin1.dll is under the GPL
|
||||
licence, this forces not only PCRE to be under the GPL, but also the entire
|
||||
application. A distributor who wants to keep their own code proprietary must
|
||||
purchase an appropriate Cygwin licence.
|
||||
|
||||
MinGW has no such restrictions. The MinGW compiler generates a library or
|
||||
executable that can run standalone on Windows without any third party dll or
|
||||
licensing issues.
|
||||
|
||||
But there is more complication:
|
||||
|
||||
If a Cygwin user uses the -mno-cygwin Cygwin gcc flag, what that really does is
|
||||
to tell Cygwin's gcc to use the MinGW gcc. Cygwin's gcc is only acting as a
|
||||
front end to MinGW's gcc (if you install Cygwin's gcc, you get both Cygwin's
|
||||
gcc and MinGW's gcc). So, a user can:
|
||||
|
||||
. Build native binaries by using MinGW or by getting Cygwin and using
|
||||
-mno-cygwin.
|
||||
|
||||
. Build binaries that depend on cygwin1.dll by using Cygwin with the normal
|
||||
compiler flags.
|
||||
|
||||
The test files that are supplied with PCRE are in Unix format, with LF
|
||||
characters as line terminators. It may be necessary to change the line
|
||||
terminators in order to get some of the tests to work. We hope to improve
|
||||
things in this area in future.
|
||||
|
||||
|
||||
BUILDING PCRE ON WINDOWS WITH CMAKE
|
||||
|
||||
CMake is an alternative build facility that can be used instead of the
|
||||
traditional Unix "configure". CMake version 2.4.7 supports Borland makefiles,
|
||||
MinGW makefiles, MSYS makefiles, NMake makefiles, UNIX makefiles, Visual Studio
|
||||
6, Visual Studio 7, Visual Studio 8, and Watcom W8. The following instructions
|
||||
were contributed by a PCRE user.
|
||||
|
||||
1. Download CMake 2.4.7 or above from http://www.cmake.org/, install and ensure
|
||||
that cmake\bin is on your path.
|
||||
|
||||
2. Unzip (retaining folder structure) the PCRE source tree into a source
|
||||
directory such as C:\pcre.
|
||||
|
||||
3. Create a new, empty build directory: C:\pcre\build\
|
||||
|
||||
4. Run CMakeSetup from the Shell environment of your build tool, e.g., Msys
|
||||
for Msys/MinGW or Visual Studio Command Prompt for VC/VC++
|
||||
|
||||
5. Enter C:\pcre\pcre-xx and C:\pcre\build for the source and build
|
||||
directories, respectively
|
||||
|
||||
6. Hit the "Configure" button.
|
||||
|
||||
7. Select the particular IDE / build tool that you are using (Visual Studio,
|
||||
MSYS makefiles, MinGW makefiles, etc.)
|
||||
|
||||
8. The GUI will then list several configuration options. This is where you can
|
||||
enable UTF-8 support, etc.
|
||||
|
||||
9. Hit "Configure" again. The adjacent "OK" button should now be active.
|
||||
|
||||
10. Hit "OK".
|
||||
|
||||
11. The build directory should now contain a usable build system, be it a
|
||||
solution file for Visual Studio, makefiles for MinGW, etc.
|
||||
|
||||
|
||||
USE OF RELATIVE PATHS WITH CMAKE ON WINDOWS
|
||||
|
||||
A PCRE user comments as follows:
|
||||
|
||||
I thought that others may want to know the current state of
|
||||
CMAKE_USE_RELATIVE_PATHS support on Windows.
|
||||
|
||||
Here it is:
|
||||
-- AdditionalIncludeDirectories is only partially modified (only the
|
||||
first path - see below)
|
||||
-- Only some of the contained file paths are modified - shown below for
|
||||
pcre.vcproj
|
||||
-- It properly modifies
|
||||
|
||||
I am sure CMake people can fix that if they want to. Until then one will
|
||||
need to replace existing absolute paths in project files with relative
|
||||
paths manually (e.g. from VS) - relative to project file location. I did
|
||||
just that before being told to try CMAKE_USE_RELATIVE_PATHS. Not a big
|
||||
deal.
|
||||
|
||||
AdditionalIncludeDirectories="E:\builds\pcre\build;E:\builds\pcre\pcre-7.5;"
|
||||
AdditionalIncludeDirectories=".;E:\builds\pcre\pcre-7.5;"
|
||||
|
||||
RelativePath="pcre.h">
|
||||
RelativePath="pcre_chartables.c">
|
||||
RelativePath="pcre_chartables.c.rule">
|
||||
|
||||
|
||||
TESTING WITH RUNTEST.BAT
|
||||
|
||||
1. Copy RunTest.bat into the directory where pcretest.exe has been created.
|
||||
|
||||
2. Edit RunTest.bat and insert a line that identifies the relative location of
|
||||
the pcre source, e.g.:
|
||||
|
||||
set srcdir=..\pcre-7.4-RC3
|
||||
|
||||
3. Run RunTest.bat from a command shell environment. Test outputs will
|
||||
automatically be compared to expected results, and discrepancies will be
|
||||
identified in the console output.
|
||||
|
||||
4. To test pcrecpp, run pcrecpp_unittest.exe, pcre_stringpiece_unittest.exe and
|
||||
pcre_scanner_unittest.exe.
|
||||
|
||||
|
||||
BUILDING UNDER WINDOWS WITH BCC5.5
|
||||
|
||||
Michael Roy sent these comments about building PCRE under Windows with BCC5.5:
|
||||
|
||||
Some of the core BCC libraries have a version of PCRE from 1998 built in,
|
||||
which can lead to pcre_exec() giving an erroneous PCRE_ERROR_NULL from a
|
||||
version mismatch. I'm including an easy workaround below, if you'd like to
|
||||
include it in the non-unix instructions:
|
||||
|
||||
When linking a project with BCC5.5, pcre.lib must be included before any of
|
||||
the libraries cw32.lib, cw32i.lib, cw32mt.lib, and cw32mti.lib on the command
|
||||
line.
|
||||
|
||||
|
||||
BUILDING UNDER WINDOWS CE WITH VISUAL STUDIO 200x
|
||||
|
||||
Vincent Richomme sent a zip archive of files to help with this process. They
|
||||
can be found in the file "pcre-vsbuild.zip" in the Contrib directory of the FTP
|
||||
site.
|
||||
|
||||
|
||||
BUILDING PCRE ON OPENVMS
|
||||
@@ -266,4 +444,5 @@ $! Locale could not be set to fr
|
||||
$!
|
||||
=========================
|
||||
|
||||
Last Updated: 17 March 2009
|
||||
****
|
||||
|
||||
Executable
+214
@@ -0,0 +1,214 @@
|
||||
#!/bin/sh
|
||||
|
||||
# Script to prepare the files for building a PCRE release. It does some
|
||||
# processing of the documentation, detrails files, and creates pcre.h.generic
|
||||
# and config.h.generic (for use by builders who can't run ./configure).
|
||||
|
||||
# You must run this script before running "make dist". It makes use of the
|
||||
# following files:
|
||||
|
||||
# 132html A Perl script that converts a .1 or .3 man page into HTML. It
|
||||
# is called from MakeRelease. It "knows" the relevant troff
|
||||
# constructs that are used in the PCRE man pages.
|
||||
|
||||
# CleanTxt A Perl script that cleans up the output of "nroff -man" by
|
||||
# removing backspaces and other redundant text so as to produce
|
||||
# a readable .txt file.
|
||||
|
||||
# Detrail A Perl script that removes trailing spaces from files.
|
||||
|
||||
# doc/index.html.src
|
||||
# A file that is copied as index.html into the doc/html directory
|
||||
# when the HTML documentation is built. It works like this so that
|
||||
# doc/html can be deleted and re-created from scratch.
|
||||
|
||||
|
||||
# First, sort out the documentation
|
||||
|
||||
# Work inside the doc directory. Give up at once if it cannot be entered, so
# that the generated files are never written into the wrong directory.

cd doc || exit 1
echo Processing documentation

# Make Text form of the documentation. It needs some mangling to make it
# tidy for online reading. Concatenate all the .3 stuff, but omit the
# individual function pages.

cat <<End >pcre.txt
-----------------------------------------------------------------------------
This file contains a concatenation of the PCRE man pages, converted to plain
text format for ease of searching with a text editor, or for use on systems
that do not have a man page processor. The small individual files that give
synopses of each function in the library have not been included. There are
separate text files for the pcregrep and pcretest commands.
-----------------------------------------------------------------------------


End

echo "Making pcre.txt"
for file in pcre pcrebuild pcrematching pcreapi pcrecallout pcrecompat \
            pcrepattern pcresyntax pcrepartial pcreprecompile \
            pcreperform pcreposix pcrecpp pcresample pcrestack ; do
  echo " Processing $file.3"
  # Format the man page, then let CleanTxt remove the backspaces and other
  # redundant text before appending the result to pcre.txt.
  nroff -c -man "$file.3" >"$file.rawtxt"
  ../CleanTxt <"$file.rawtxt" >>pcre.txt
  /bin/rm "$file.rawtxt"
  echo "------------------------------------------------------------------------------" >>pcre.txt
  if [ "$file" != "pcresample" ] ; then
    echo " " >>pcre.txt
    echo " " >>pcre.txt
  fi
done

# The three commands each get a text file of their own.

for file in pcretest pcregrep pcre-config ; do
  echo Making $file.txt
  nroff -c -man "$file.1" >"$file.rawtxt"
  ../CleanTxt <"$file.rawtxt" >"$file.txt"
  /bin/rm "$file.rawtxt"
done
|
||||
|
||||
|
||||
# Make HTML form of the documentation.
|
||||
|
||||
echo "Making HTML documentation"
# -f: do not complain when html/ is already empty.
/bin/rm -f html/*
cp index.html.src html/index.html

# The command man pages (.1) always get a table of contents. Stop on the
# first conversion failure, exactly as the .3 loop below does.
for file in *.1 ; do
  base=`basename "$file" .1`
  echo " Making $base.html"
  ../132html -toc "$base" <"$file" >"html/$base.html" || exit 1
done

# Exclude table of contents for function summaries (the pages whose names
# contain an underscore). Also exclude them for small pages that have only
# one section.
for file in *.3 ; do
  base=`basename "$file" .3`
  toc=-toc
  case "$base" in
    *_*) toc="" ;;
  esac
  if [ "$base" = "pcresample" ] || \
     [ "$base" = "pcrestack" ] || \
     [ "$base" = "pcrecompat" ] || \
     [ "$base" = "pcreperform" ] ; then
    toc=""
  fi
  echo " Making $base.html"
  # $toc is deliberately unquoted: when empty it must vanish rather than be
  # passed to 132html as an empty argument.
  ../132html $toc "$base" <"$file" >"html/$base.html" || exit 1
done

# End of documentation processing

cd ..
echo Documentation done
|
||||
|
||||
# These files are detrailed; do not detrail the test data because there may be
|
||||
# significant trailing spaces. The configure files are also omitted from the
|
||||
# detrailing.
|
||||
|
||||
# NOTE(review): NON-UNIX-USE for this release lists pcre_ucd.c among the
# sources and no longer lists pcre_ucp_searchfuncs.c; confirm that the ucp*
# entries below still match the files in the distribution.
files="\
  Makefile.am \
  Makefile.in \
  configure.ac \
  README \
  LICENCE \
  COPYING \
  AUTHORS \
  NEWS \
  NON-UNIX-USE \
  INSTALL \
  132html \
  CleanTxt \
  Detrail \
  ChangeLog \
  CMakeLists.txt \
  RunGrepTest \
  RunTest \
  RunTest.bat \
  pcre-config.in \
  libpcre.pc.in \
  libpcrecpp.pc.in \
  config.h.in \
  pcre_printint.src \
  pcre_chartables.c.dist \
  pcredemo.c \
  pcregrep.c \
  pcretest.c \
  dftables.c \
  pcreposix.c \
  pcreposix.h \
  pcre.h.in \
  pcre_internal.h \
  pcre_compile.c \
  pcre_config.c \
  pcre_dfa_exec.c \
  pcre_exec.c \
  pcre_fullinfo.c \
  pcre_get.c \
  pcre_globals.c \
  pcre_info.c \
  pcre_maketables.c \
  pcre_newline.c \
  pcre_ord2utf8.c \
  pcre_refcount.c \
  pcre_study.c \
  pcre_tables.c \
  pcre_try_flipped.c \
  pcre_ucp_searchfuncs.c \
  pcre_valid_utf8.c \
  pcre_version.c \
  pcre_xclass.c \
  pcre_scanner.cc \
  pcre_scanner.h \
  pcre_scanner_unittest.cc \
  pcrecpp.cc \
  pcrecpp.h \
  pcrecpparg.h.in \
  pcrecpp_unittest.cc \
  pcre_stringpiece.cc \
  pcre_stringpiece.h.in \
  pcre_stringpiece_unittest.cc \
  perltest.pl \
  ucp.h \
  ucpinternal.h \
  ucptable.h \
  makevp.bat \
  pcre.def \
  libpcre.def \
  libpcreposix.def"

echo Detrailing
# $files is deliberately unquoted so that the shell splits it into one
# argument per file name.
./Detrail $files doc/p* doc/html/*
|
||||
|
||||
echo Doing basic configure to get default pcre.h and config.h
# This is in case the caller has set aliases (as I do - PH)
unset cp ls mv rm
# If configure fails, pcre.h and config.h are missing or left over from an
# earlier run, so stop rather than turn them into the generic files.
./configure >/dev/null || exit 1

echo Converting pcre.h and config.h to generic forms
cp -f pcre.h pcre.h.generic || exit 1

# Copy config.h to config.h.generic, wrapping every "#define NAME" line
# (except those whose name starts with PACKAGE) in "#ifndef NAME" / "#endif",
# so that a value that is already defined, for example with -D on the
# compiler command line, takes precedence. All other lines are copied as is.
perl <<'END' || exit 1
use strict;
use warnings;

open(my $in, '<', 'config.h') or die "Can't open config.h: $!\n";
open(my $out, '>', 'config.h.generic')
  or die "Can't open config.h.generic: $!\n";
while (my $line = <$in>)
  {
  if ($line =~ /^#define\s(?!PACKAGE)(\w+)/)
    {
    print $out "#ifndef $1\n", $line, "#endif\n";
    }
  else
    {
    print $out $line;
    }
  }
close($in);
# Buffered write errors only show up when the handle is closed.
close($out) or die "Can't write config.h.generic: $!\n";
END

echo Done
|
||||
|
||||
#End
|
||||
+455
-216
@@ -1,55 +1,93 @@
|
||||
README file for PCRE (Perl-compatible regular expression library)
|
||||
-----------------------------------------------------------------
|
||||
|
||||
The latest release of PCRE is always available from
|
||||
The latest release of PCRE is always available in three alternative formats
|
||||
from:
|
||||
|
||||
ftp://ftp.csx.cam.ac.uk/pub/software/programming/pcre/pcre-xxx.tar.gz
|
||||
ftp://ftp.csx.cam.ac.uk/pub/software/programming/pcre/pcre-xxx.tar.bz2
|
||||
ftp://ftp.csx.cam.ac.uk/pub/software/programming/pcre/pcre-xxx.zip
|
||||
|
||||
There is a mailing list for discussion about the development of PCRE at
|
||||
|
||||
pcre-dev@exim.org
|
||||
|
||||
Please read the NEWS file if you are upgrading from a previous release.
|
||||
The contents of this README file are:
|
||||
|
||||
The PCRE APIs
|
||||
Documentation for PCRE
|
||||
Contributions by users of PCRE
|
||||
Building PCRE on non-Unix systems
|
||||
Building PCRE on Unix-like systems
|
||||
Retrieving configuration information on Unix-like systems
|
||||
Shared libraries on Unix-like systems
|
||||
Cross-compiling on Unix-like systems
|
||||
Using HP's ANSI C++ compiler (aCC)
|
||||
Making new tarballs
|
||||
Testing PCRE
|
||||
Character tables
|
||||
File manifest
|
||||
|
||||
|
||||
The PCRE APIs
|
||||
-------------
|
||||
|
||||
PCRE is written in C, and it has its own API. The distribution now includes a
|
||||
set of C++ wrapper functions, courtesy of Google Inc. (see the pcrecpp man page
|
||||
for details).
|
||||
PCRE is written in C, and it has its own API. The distribution also includes a
|
||||
set of C++ wrapper functions (see the pcrecpp man page for details), courtesy
|
||||
of Google Inc.
|
||||
|
||||
Also included are a set of C wrapper functions that are based on the POSIX
|
||||
API. These end up in the library called libpcreposix. Note that this just
|
||||
provides a POSIX calling interface to PCRE: the regular expressions themselves
|
||||
still follow Perl syntax and semantics. The header file for the POSIX-style
|
||||
functions is called pcreposix.h. The official POSIX name is regex.h, but I
|
||||
didn't want to risk possible problems with existing files of that name by
|
||||
distributing it that way. To use it with an existing program that uses the
|
||||
POSIX API, it will have to be renamed or pointed at by a link.
|
||||
In addition, there is a set of C wrapper functions that are based on the POSIX
|
||||
regular expression API (see the pcreposix man page). These end up in the
|
||||
library called libpcreposix. Note that this just provides a POSIX calling
|
||||
interface to PCRE; the regular expressions themselves still follow Perl syntax
|
||||
and semantics. The POSIX API is restricted, and does not give full access to
|
||||
all of PCRE's facilities.
|
||||
|
||||
The header file for the POSIX-style functions is called pcreposix.h. The
|
||||
official POSIX name is regex.h, but I did not want to risk possible problems
|
||||
with existing files of that name by distributing it that way. To use PCRE with
|
||||
an existing program that uses the POSIX API, pcreposix.h will have to be
|
||||
renamed or pointed at by a link.
|
||||
|
||||
If you are using the POSIX interface to PCRE and there is already a POSIX regex
|
||||
library installed on your system, you must take care when linking programs to
|
||||
library installed on your system, as well as worrying about the regex.h header
|
||||
file (as mentioned above), you must also take care when linking programs to
|
||||
ensure that they link with PCRE's libpcreposix library. Otherwise they may pick
|
||||
up the "real" POSIX functions of the same name.
|
||||
up the POSIX functions of the same name from the other library.
|
||||
|
||||
One way of avoiding this confusion is to compile PCRE with the addition of
|
||||
-Dregcomp=PCREregcomp (and similarly for the other POSIX functions) to the
|
||||
compiler flags (CFLAGS if you are using "configure" -- see below). This has the
|
||||
effect of renaming the functions so that the names no longer clash. Of course,
|
||||
you have to do the same thing for your applications, or write them using the
|
||||
new names.
|
||||
|
||||
|
||||
Documentation for PCRE
|
||||
----------------------
|
||||
|
||||
If you install PCRE in the normal way, you will end up with an installed set of
|
||||
man pages whose names all start with "pcre". The one that is just called "pcre"
|
||||
lists all the others. In addition to these man pages, the PCRE documentation is
|
||||
supplied in two other forms; however, as there is no standard place to install
|
||||
them, they are left in the doc directory of the unpacked source distribution.
|
||||
These forms are:
|
||||
If you install PCRE in the normal way on a Unix-like system, you will end up
|
||||
with a set of man pages whose names all start with "pcre". The one that is just
|
||||
called "pcre" lists all the others. In addition to these man pages, the PCRE
|
||||
documentation is supplied in two other forms:
|
||||
|
||||
1. Files called doc/pcre.txt, doc/pcregrep.txt, and doc/pcretest.txt. The
|
||||
first of these is a concatenation of the text forms of all the section 3
|
||||
man pages except those that summarize individual functions. The other two
|
||||
are the text forms of the section 1 man pages for the pcregrep and
|
||||
pcretest commands. Text forms are provided for ease of scanning with text
|
||||
editors or similar tools.
|
||||
1. There are files called doc/pcre.txt, doc/pcregrep.txt, and
|
||||
doc/pcretest.txt in the source distribution. The first of these is a
|
||||
concatenation of the text forms of all the section 3 man pages except
|
||||
those that summarize individual functions. The other two are the text
|
||||
forms of the section 1 man pages for the pcregrep and pcretest commands.
|
||||
These text forms are provided for ease of scanning with text editors or
|
||||
similar tools. They are installed in <prefix>/share/doc/pcre, where
|
||||
<prefix> is the installation prefix (defaulting to /usr/local).
|
||||
|
||||
2. A subdirectory called doc/html contains all the documentation in HTML
|
||||
form, hyperlinked in various ways, and rooted in a file called
|
||||
doc/index.html.
|
||||
2. A set of files containing all the documentation in HTML form, hyperlinked
|
||||
in various ways, and rooted in a file called index.html, is distributed in
|
||||
doc/html and installed in <prefix>/share/doc/pcre/html.
|
||||
|
||||
Users of PCRE have contributed files containing the documentation for various
|
||||
releases in CHM format. These can be found in the Contrib directory of the FTP
|
||||
site (see next section).
|
||||
|
||||
|
||||
Contributions by users of PCRE
|
||||
@@ -59,27 +97,48 @@ You can find contributions from PCRE users in the directory
|
||||
|
||||
ftp://ftp.csx.cam.ac.uk/pub/software/programming/pcre/Contrib
|
||||
|
||||
where there is also a README file giving brief descriptions of what they are.
|
||||
Several of them provide support for compiling PCRE on various flavours of
|
||||
Windows systems (I myself do not use Windows). Some are complete in themselves;
|
||||
others are pointers to URLs containing relevant files.
|
||||
There is a README file giving brief descriptions of what they are. Some are
|
||||
complete in themselves; others are pointers to URLs containing relevant files.
|
||||
Some of this material is likely to be well out-of-date. Several of the earlier
|
||||
contributions provided support for compiling PCRE on various flavours of
|
||||
Windows (I myself do not use Windows). Nowadays there is more Windows support
|
||||
in the standard distribution, so these contributions have been archived.
|
||||
|
||||
|
||||
Building PCRE on a Unix-like system
|
||||
-----------------------------------
|
||||
Building PCRE on non-Unix systems
|
||||
---------------------------------
|
||||
|
||||
For a non-Unix system, please read the comments in the file NON-UNIX-USE,
|
||||
though if your system supports the use of "configure" and "make" you may be
|
||||
able to build PCRE in the same way as for Unix-like systems. PCRE can also be
|
||||
configured in many platform environments using the GUI facility of CMake's
|
||||
CMakeSetup. It creates Makefiles, solution files, etc.
|
||||
|
||||
PCRE has been compiled on many different operating systems. It should be
|
||||
straightforward to build PCRE on any system that has a Standard C compiler and
|
||||
library, because it uses only Standard C functions.
|
||||
|
||||
|
||||
Building PCRE on Unix-like systems
|
||||
----------------------------------
|
||||
|
||||
If you are using HP's ANSI C++ compiler (aCC), please see the special note
|
||||
in the section entitled "Using HP's ANSI C++ compiler (aCC)" below.
|
||||
|
||||
The following instructions assume the use of the widely used "configure, make,
|
||||
make install" process. There is also support for CMake in the PCRE
|
||||
distribution; there are some comments about using CMake in the NON-UNIX-USE
|
||||
file, though it can also be used in Unix-like systems.
|
||||
|
||||
To build PCRE on a Unix-like system, first run the "configure" command from the
|
||||
PCRE distribution directory, with your current directory set to the directory
|
||||
where you want the files to be created. This command is a standard GNU
|
||||
"autoconf" configuration script, for which generic instructions are supplied in
|
||||
INSTALL.
|
||||
the file INSTALL.
|
||||
|
||||
Most commonly, people build PCRE within its own distribution directory, and in
|
||||
this case, on many systems, just running "./configure" is sufficient, but the
|
||||
usual methods of changing standard defaults are available. For example:
|
||||
this case, on many systems, just running "./configure" is sufficient. However,
|
||||
the usual methods of changing standard defaults are available. For example:
|
||||
|
||||
CFLAGS='-O2 -Wall' ./configure --prefix=/opt/local
|
||||
|
||||
@@ -103,13 +162,16 @@ library. You can read more about them in the pcrebuild man page.
|
||||
|
||||
. If you want to suppress the building of the C++ wrapper library, you can add
|
||||
--disable-cpp to the "configure" command. Otherwise, when "configure" is run,
|
||||
will try to find a C++ compiler and C++ header files, and if it succeeds, it
|
||||
will try to build the C++ wrapper.
|
||||
it will try to find a C++ compiler and C++ header files, and if it succeeds,
|
||||
it will try to build the C++ wrapper.
|
||||
|
||||
. If you want to make use of the support for UTF-8 character strings in PCRE,
|
||||
you must add --enable-utf8 to the "configure" command. Without it, the code
|
||||
for handling UTF-8 is not included in the library. (Even when included, it
|
||||
still has to be enabled by an option at run time.)
|
||||
. If you want to make use of the support for UTF-8 Unicode character strings in
|
||||
PCRE, you must add --enable-utf8 to the "configure" command. Without it, the
|
||||
code for handling UTF-8 is not included in the library. Even when included,
|
||||
it still has to be enabled by an option at run time. When PCRE is compiled
|
||||
with this option, its input can only be either ASCII or UTF-8, even when
|
||||
running on EBCDIC platforms. It is not possible to use both --enable-utf8 and
|
||||
--enable-ebcdic at the same time.
|
||||
|
||||
. If, in addition to support for UTF-8 character strings, you want to include
|
||||
support for the \P, \p, and \X sequences that recognize Unicode character
|
||||
@@ -118,17 +180,31 @@ library. You can read more about them in the pcrebuild man page.
|
||||
property table); only the basic two-letter properties such as Lu are
|
||||
supported.
|
||||
|
||||
. You can build PCRE to recognize either CR or LF or the sequence CRLF as
|
||||
indicating the end of a line. Whatever you specify at build time is the
|
||||
default; the caller of PCRE can change the selection at run time. The default
|
||||
newline indicator is a single LF character (the Unix standard). You can
|
||||
specify the default newline indicator by adding --newline-is-cr or
|
||||
--newline-is-lf or --newline-is-crlf to the "configure" command,
|
||||
respectively.
|
||||
. You can build PCRE to recognize either CR or LF or the sequence CRLF or any
|
||||
of the preceding, or any of the Unicode newline sequences as indicating the
|
||||
end of a line. Whatever you specify at build time is the default; the caller
|
||||
of PCRE can change the selection at run time. The default newline indicator
|
||||
is a single LF character (the Unix standard). You can specify the default
|
||||
newline indicator by adding --enable-newline-is-cr or --enable-newline-is-lf
|
||||
or --enable-newline-is-crlf or --enable-newline-is-anycrlf or
|
||||
--enable-newline-is-any to the "configure" command, respectively.
|
||||
|
||||
If you specify --enable-newline-is-cr or --enable-newline-is-crlf, some of
|
||||
the standard tests will fail, because the lines in the test files end with
|
||||
LF. Even if the files are edited to change the line endings, there are likely
|
||||
to be some failures. With --enable-newline-is-anycrlf or
|
||||
--enable-newline-is-any, many tests should succeed, but there may be some
|
||||
failures.
|
||||
|
||||
. By default, the sequence \R in a pattern matches any Unicode line ending
|
||||
sequence. This is independent of the option specifying what PCRE considers to
|
||||
be the end of a line (see above). However, the caller of PCRE can restrict \R
|
||||
to match only CR, LF, or CRLF. You can make this the default by adding
|
||||
--enable-bsr-anycrlf to the "configure" command (bsr = "backslash R").
|
||||
|
||||
. When called via the POSIX interface, PCRE uses malloc() to get additional
|
||||
storage for processing capturing parentheses if there are more than 10 of
|
||||
them. You can increase this threshold by setting, for example,
|
||||
them in a pattern. You can increase this threshold by setting, for example,
|
||||
|
||||
--with-posix-malloc-threshold=20
|
||||
|
||||
@@ -141,8 +217,8 @@ library. You can read more about them in the pcrebuild man page.
|
||||
--with-match-limit=500000
|
||||
|
||||
on the "configure" command. This is just the default; individual calls to
|
||||
pcre_exec() can supply their own value. There is discussion on the pcreapi
|
||||
man page.
|
||||
pcre_exec() can supply their own value. There is more discussion on the
|
||||
pcreapi man page.
|
||||
|
||||
. There is a separate counter that limits the depth of recursive function calls
|
||||
during a matching process. This also has a default of ten million, which is
|
||||
@@ -157,37 +233,92 @@ library. You can read more about them in the pcrebuild man page.
|
||||
. The default maximum compiled pattern size is around 64K. You can increase
|
||||
this by adding --with-link-size=3 to the "configure" command. You can
|
||||
increase it even more by setting --with-link-size=4, but this is unlikely
|
||||
ever to be necessary. If you build PCRE with an increased link size, test 2
|
||||
(and 5 if you are using UTF-8) will fail. Part of the output of these tests
|
||||
is a representation of the compiled pattern, and this changes with the link
|
||||
size.
|
||||
ever to be necessary. Increasing the internal link size will reduce
|
||||
performance.
|
||||
|
||||
. You can build PCRE so that its internal match() function that is called from
|
||||
pcre_exec() does not call itself recursively. Instead, it uses blocks of data
|
||||
from the heap via special functions pcre_stack_malloc() and pcre_stack_free()
|
||||
to save data that would otherwise be saved on the stack. To build PCRE like
|
||||
this, use
|
||||
pcre_exec() does not call itself recursively. Instead, it uses memory blocks
|
||||
obtained from the heap via the special functions pcre_stack_malloc() and
|
||||
pcre_stack_free() to save data that would otherwise be saved on the stack. To
|
||||
build PCRE like this, use
|
||||
|
||||
--disable-stack-for-recursion
|
||||
|
||||
on the "configure" command. PCRE runs more slowly in this mode, but it may be
|
||||
necessary in environments with limited stack sizes. This applies only to the
|
||||
pcre_exec() function; it does not apply to pcre_dfa_exec(), which does not
|
||||
use deeply nested recursion.
|
||||
use deeply nested recursion. There is a discussion about stack sizes in the
|
||||
pcrestack man page.
|
||||
|
||||
The "configure" script builds eight files for the basic C library:
|
||||
. For speed, PCRE uses four tables for manipulating and identifying characters
|
||||
whose code point values are less than 256. By default, it uses a set of
|
||||
tables for ASCII encoding that is part of the distribution. If you specify
|
||||
|
||||
--enable-rebuild-chartables
|
||||
|
||||
a program called dftables is compiled and run in the default C locale when
|
||||
you obey "make". It builds a source file called pcre_chartables.c. If you do
|
||||
not specify this option, pcre_chartables.c is created as a copy of
|
||||
pcre_chartables.c.dist. See "Character tables" below for further information.
|
||||
|
||||
. It is possible to compile PCRE for use on systems that use EBCDIC as their
|
||||
character code (as opposed to ASCII) by specifying
|
||||
|
||||
--enable-ebcdic
|
||||
|
||||
This automatically implies --enable-rebuild-chartables (see above). However,
|
||||
when PCRE is built this way, it always operates in EBCDIC. It cannot support
|
||||
both EBCDIC and UTF-8.
|
||||
|
||||
. It is possible to compile pcregrep to use libz and/or libbz2, in order to
|
||||
read .gz and .bz2 files (respectively), by specifying one or both of
|
||||
|
||||
--enable-pcregrep-libz
|
||||
--enable-pcregrep-libbz2
|
||||
|
||||
Of course, the relevant libraries must be installed on your system.
|
||||
|
||||
. It is possible to compile pcretest so that it links with the libreadline
|
||||
library, by specifying
|
||||
|
||||
--enable-pcretest-libreadline
|
||||
|
||||
If this is done, when pcretest's input is from a terminal, it reads it using
|
||||
the readline() function. This provides line-editing and history facilities.
|
||||
Note that libreadline is GPL-licenced, so if you distribute a binary of
|
||||
pcretest linked in this way, there may be licensing issues.
|
||||
|
||||
Setting this option causes the -lreadline option to be added to the pcretest
|
||||
build. In many operating environments with a system-installed readline
|
||||
library this is sufficient. However, in some environments (e.g. if an
|
||||
unmodified distribution version of readline is in use), it may be necessary
|
||||
to specify something like LIBS="-lncurses" as well. This is because, to quote
|
||||
the readline INSTALL, "Readline uses the termcap functions, but does not link
|
||||
with the termcap or curses library itself, allowing applications which link
|
||||
with readline the to choose an appropriate library." If you get error
|
||||
messages about missing functions tgetstr, tgetent, tputs, tgetflag, or tgoto,
|
||||
this is the problem, and linking with the ncurses library should fix it.
|
||||
|
||||
The "configure" script builds the following files for the basic C library:
|
||||
|
||||
. Makefile is the makefile that builds the library
|
||||
. config.h contains build-time configuration options for the library
|
||||
. pcre.h is the public PCRE header file
|
||||
. pcre-config is a script that shows the settings of "configure" options
|
||||
. libpcre.pc is data for the pkg-config command
|
||||
. libtool is a script that builds shared and/or static libraries
|
||||
. RunTest is a script for running tests on the library
|
||||
. RunTest is a script for running tests on the basic C library
|
||||
. RunGrepTest is a script for running tests on the pcregrep command
|
||||
|
||||
In addition, if a C++ compiler is found, the following are also built:
|
||||
Versions of config.h and pcre.h are distributed in the PCRE tarballs under
|
||||
the names config.h.generic and pcre.h.generic. These are provided for the
|
||||
benefit of those who have to build PCRE without the benefit of "configure". If
|
||||
you use "configure", the .generic versions are not used.
|
||||
|
||||
. pcrecpp.h is the header file for programs that call PCRE via the C++ wrapper
|
||||
If a C++ compiler is found, the following files are also built:
|
||||
|
||||
. libpcrecpp.pc is data for the pkg-config command
|
||||
. pcrecpparg.h is a header file for programs that call PCRE via the C++ wrapper
|
||||
. pcre_stringpiece.h is the header for the C++ "stringpiece" functions
|
||||
|
||||
The "configure" script also creates config.status, which is an executable
|
||||
@@ -196,17 +327,61 @@ contains compiler output from tests that "configure" runs.
|
||||
|
||||
Once "configure" has run, you can run "make". It builds two libraries, called
|
||||
libpcre and libpcreposix, a test program called pcretest, and the pcregrep
|
||||
command. If a C++ compiler was found on your system, it also builds the C++
|
||||
command. If a C++ compiler was found on your system, "make" also builds the C++
|
||||
wrapper library, which is called libpcrecpp, and some test programs called
|
||||
pcrecpp_unittest, pcre_scanner_unittest, and pcre_stringpiece_unittest.
|
||||
Building the C++ wrapper can be disabled by adding --disable-cpp to the
|
||||
"configure" command.
|
||||
|
||||
The command "make test" runs all the appropriate tests. Details of the PCRE
|
||||
tests are given in a separate section of this document, below.
|
||||
The command "make check" runs all the appropriate tests. Details of the PCRE
|
||||
tests are given below in a separate section of this document.
|
||||
|
||||
You can use "make install" to copy the libraries, the public header files
|
||||
pcre.h, pcreposix.h, pcrecpp.h, and pcre_stringpiece.h (the last two only if
|
||||
the C++ wrapper was built), and the man pages to appropriate live directories
|
||||
on your system, in the normal way.
|
||||
You can use "make install" to install PCRE into live directories on your
|
||||
system. The following are installed (file names are all relative to the
|
||||
<prefix> that is set when "configure" is run):
|
||||
|
||||
Commands (bin):
|
||||
pcretest
|
||||
pcregrep
|
||||
pcre-config
|
||||
|
||||
Libraries (lib):
|
||||
libpcre
|
||||
libpcreposix
|
||||
libpcrecpp (if C++ support is enabled)
|
||||
|
||||
Configuration information (lib/pkgconfig):
|
||||
libpcre.pc
|
||||
libpcrecpp.pc (if C++ support is enabled)
|
||||
|
||||
Header files (include):
|
||||
pcre.h
|
||||
pcreposix.h
|
||||
pcre_scanner.h )
|
||||
pcre_stringpiece.h ) if C++ support is enabled
|
||||
pcrecpp.h )
|
||||
pcrecpparg.h )
|
||||
|
||||
Man pages (share/man/man{1,3}):
|
||||
pcregrep.1
|
||||
pcretest.1
|
||||
pcre.3
|
||||
pcre*.3 (lots more pages, all starting "pcre")
|
||||
|
||||
HTML documentation (share/doc/pcre/html):
|
||||
index.html
|
||||
*.html (lots more pages, hyperlinked from index.html)
|
||||
|
||||
Text file documentation (share/doc/pcre):
|
||||
AUTHORS
|
||||
COPYING
|
||||
ChangeLog
|
||||
LICENCE
|
||||
NEWS
|
||||
README
|
||||
pcre.txt (a concatenation of the man(3) pages)
|
||||
pcretest.txt the pcretest man page
|
||||
pcregrep.txt the pcregrep man page
|
||||
|
||||
If you want to remove PCRE from your system, you can run "make uninstall".
|
||||
This removes all the files that "make install" installed. However, it does not
|
||||
@@ -216,9 +391,8 @@ remove any directories, because these are often shared with other programs.
|
||||
Retrieving configuration information on Unix-like systems
|
||||
---------------------------------------------------------
|
||||
|
||||
Running "make install" also installs the command pcre-config, which can be used
|
||||
to recall information about the PCRE configuration and installation. For
|
||||
example:
|
||||
Running "make install" installs the command pcre-config, which can be used to
|
||||
recall information about the PCRE configuration and installation. For example:
|
||||
|
||||
pcre-config --version
|
||||
|
||||
@@ -237,7 +411,7 @@ single command is used. For example:
|
||||
pkg-config --cflags pcre
|
||||
|
||||
The data is held in *.pc files that are installed in a directory called
|
||||
pkgconfig.
|
||||
<prefix>/lib/pkgconfig.
|
||||
|
||||
|
||||
Shared libraries on Unix-like systems
|
||||
@@ -254,7 +428,7 @@ built. The programs pcretest and pcregrep are built to use these uninstalled
|
||||
libraries (by means of wrapper scripts in the case of shared libraries). When
|
||||
you use "make install" to install shared libraries, pcregrep and pcretest are
|
||||
automatically re-built to use the newly installed shared libraries before being
|
||||
installed themselves. However, the versions left in the source directory still
|
||||
installed themselves. However, the versions left in the build directory still
|
||||
use the uninstalled libraries.
|
||||
|
||||
To build PCRE using static libraries only you must use --disable-shared when
|
||||
@@ -266,25 +440,33 @@ Then run "make" in the usual way. Similarly, you can use --disable-static to
|
||||
build only shared libraries.
|
||||
|
||||
|
||||
Cross-compiling on a Unix-like system
|
||||
-------------------------------------
|
||||
Cross-compiling on Unix-like systems
|
||||
------------------------------------
|
||||
|
||||
You can specify CC and CFLAGS in the normal way to the "configure" command, in
|
||||
order to cross-compile PCRE for some other host. However, during the building
|
||||
process, the dftables.c source file is compiled *and run* on the local host, in
|
||||
order to generate the default character tables (the chartables.c file). It
|
||||
therefore needs to be compiled with the local compiler, not the cross compiler.
|
||||
You can do this by specifying CC_FOR_BUILD (and if necessary CFLAGS_FOR_BUILD;
|
||||
there are also CXX_FOR_BUILD and CXXFLAGS_FOR_BUILD for the C++ wrapper)
|
||||
when calling the "configure" command. If they are not specified, they default
|
||||
to the values of CC and CFLAGS.
|
||||
order to cross-compile PCRE for some other host. However, you should NOT
|
||||
specify --enable-rebuild-chartables, because if you do, the dftables.c source
|
||||
file is compiled and run on the local host, in order to generate the inbuilt
|
||||
character tables (the pcre_chartables.c file). This will probably not work,
|
||||
because dftables.c needs to be compiled with the local compiler, not the cross
|
||||
compiler.
|
||||
|
||||
When --enable-rebuild-chartables is not specified, pcre_chartables.c is created
|
||||
by making a copy of pcre_chartables.c.dist, which is a default set of tables
|
||||
that assumes ASCII code. Cross-compiling with the default tables should not be
|
||||
a problem.
|
||||
|
||||
If you need to modify the character tables when cross-compiling, you should
|
||||
move pcre_chartables.c.dist out of the way, then compile dftables.c by hand and
|
||||
run it on the local host to make a new version of pcre_chartables.c.dist.
|
||||
Then when you cross-compile PCRE this new version of the tables will be used.
|
||||
|
||||
|
||||
Using HP's ANSI C++ compiler (aCC)
|
||||
----------------------------------
|
||||
|
||||
Unless C++ support is disabled by specifiying the "--disable-cpp" option of the
|
||||
"configure" script, you *must* include the "-AA" option in the CXXFLAGS
|
||||
Unless C++ support is disabled by specifying the "--disable-cpp" option of the
|
||||
"configure" script, you must include the "-AA" option in the CXXFLAGS
|
||||
environment variable in order for the C++ components to compile correctly.
|
||||
|
||||
Also, note that the aCC compiler on PA-RISC platforms may have a defect whereby
|
||||
@@ -296,49 +478,48 @@ running the "configure" script:
|
||||
CXXLDFLAGS="-lstd_v2 -lCsup_v2"
|
||||
|
||||
|
||||
Building on non-Unix systems
|
||||
----------------------------
|
||||
Making new tarballs
|
||||
-------------------
|
||||
|
||||
For a non-Unix system, read the comments in the file NON-UNIX-USE, though if
|
||||
the system supports the use of "configure" and "make" you may be able to build
|
||||
PCRE in the same way as for Unix systems.
|
||||
The command "make dist" creates three PCRE tarballs, in tar.gz, tar.bz2, and
|
||||
zip formats. The command "make distcheck" does the same, but then does a trial
|
||||
build of the new distribution to ensure that it works.
|
||||
|
||||
PCRE has been compiled on Windows systems and on Macintoshes, but I don't know
|
||||
the details because I don't use those systems. It should be straightforward to
|
||||
build PCRE on any system that has a Standard C compiler, because it uses only
|
||||
Standard C functions.
|
||||
If you have modified any of the man page sources in the doc directory, you
|
||||
should first run the PrepareRelease script before making a distribution. This
|
||||
script creates the .txt and HTML forms of the documentation from the man pages.
|
||||
|
||||
|
||||
Testing PCRE
|
||||
------------
|
||||
|
||||
To test PCRE on a Unix system, run the RunTest script that is created by the
|
||||
configuring process. There is also a script called RunGrepTest that tests the
|
||||
options of the pcregrep command. If the C++ wrapper library is build, three
|
||||
test programs called pcrecpp_unittest, pcre_scanner_unittest, and
|
||||
pcre_stringpiece_unittest are provided.
|
||||
To test the basic PCRE library on a Unix system, run the RunTest script that is
|
||||
created by the configuring process. There is also a script called RunGrepTest
|
||||
that tests the options of the pcregrep command. If the C++ wrapper library is
|
||||
built, three test programs called pcrecpp_unittest, pcre_scanner_unittest, and
|
||||
pcre_stringpiece_unittest are also built.
|
||||
|
||||
Both the scripts and all the program tests are run if you obey "make runtest",
|
||||
"make check", or "make test". For other systems, see the instructions in
|
||||
NON-UNIX-USE.
|
||||
Both the scripts and all the program tests are run if you obey "make check" or
|
||||
"make test". For other systems, see the instructions in NON-UNIX-USE.
|
||||
|
||||
The RunTest script runs the pcretest test program (which is documented in its
|
||||
own man page) on each of the testinput files (in the testdata directory) in
|
||||
own man page) on each of the testinput files in the testdata directory in
|
||||
turn, and compares the output with the contents of the corresponding testoutput
|
||||
file. A file called testtry is used to hold the main output from pcretest
|
||||
files. A file called testtry is used to hold the main output from pcretest
|
||||
(testsavedregex is also used as a working file). To run pcretest on just one of
|
||||
the test files, give its number as an argument to RunTest, for example:
|
||||
|
||||
RunTest 2
|
||||
|
||||
The first file can also be fed directly into the perltest script to check that
|
||||
Perl gives the same results. The only difference you should see is in the first
|
||||
few lines, where the Perl version is given instead of the PCRE version.
|
||||
The first test file can also be fed directly into the perltest.pl script to
|
||||
check that Perl gives the same results. The only difference you should see is
|
||||
in the first few lines, where the Perl version is given instead of the PCRE
|
||||
version.
|
||||
|
||||
The second set of tests check pcre_fullinfo(), pcre_info(), pcre_study(),
|
||||
pcre_copy_substring(), pcre_get_substring(), pcre_get_substring_list(), error
|
||||
detection, and run-time flags that are specific to PCRE, as well as the POSIX
|
||||
wrapper API. It also uses the debugging flag to check some of the internals of
|
||||
wrapper API. It also uses the debugging flags to check some of the internals of
|
||||
pcre_compile().
|
||||
|
||||
If you build PCRE with a locale setting that is not the standard C locale, the
|
||||
@@ -364,6 +545,12 @@ is output to say why. If running this test produces instances of the error
|
||||
in the comparison output, it means that locale is not available on your system,
|
||||
despite being listed by "locale". This does not mean that PCRE is broken.
|
||||
|
||||
[If you are trying to run this test on Windows, you may be able to get it to
|
||||
work by changing "fr_FR" to "french" everywhere it occurs. Alternatively, use
|
||||
RunTest.bat. The version of RunTest.bat included with PCRE 7.4 and above uses
|
||||
Windows versions of test 3. More info on using RunTest.bat is included in the
|
||||
document entitled NON-UNIX-USE.]
|
||||
|
||||
The fourth test checks the UTF-8 support. It is not run automatically unless
|
||||
PCRE is built with UTF-8 support. To do this you must set --enable-utf8 when
|
||||
running "configure". This file can also be fed directly to the perltest script,
|
||||
@@ -373,8 +560,8 @@ commented in the script, can be be used.)
|
||||
The fifth test checks error handling with UTF-8 encoding, and internal UTF-8
|
||||
features of PCRE that are not relevant to Perl.
|
||||
|
||||
The sixth and test checks the support for Unicode character properties. It it
|
||||
not run automatically unless PCRE is built with Unicode property support. To to
|
||||
The sixth test checks the support for Unicode character properties. It is not
|
||||
run automatically unless PCRE is built with Unicode property support. To do
|
||||
this you must set --enable-unicode-properties when running "configure".
|
||||
|
||||
The seventh, eighth, and ninth tests check the pcre_dfa_exec() alternative
|
||||
@@ -386,27 +573,42 @@ automatically unless PCRE is build with the relevant support.
|
||||
Character tables
|
||||
----------------
|
||||
|
||||
PCRE uses four tables for manipulating and identifying characters whose values
|
||||
are less than 256. The final argument of the pcre_compile() function is a
|
||||
pointer to a block of memory containing the concatenated tables. A call to
|
||||
pcre_maketables() can be used to generate a set of tables in the current
|
||||
locale. If the final argument for pcre_compile() is passed as NULL, a set of
|
||||
default tables that is built into the binary is used.
|
||||
For speed, PCRE uses four tables for manipulating and identifying characters
|
||||
whose code point values are less than 256. The final argument of the
|
||||
pcre_compile() function is a pointer to a block of memory containing the
|
||||
concatenated tables. A call to pcre_maketables() can be used to generate a set
|
||||
of tables in the current locale. If the final argument for pcre_compile() is
|
||||
passed as NULL, a set of default tables that is built into the binary is used.
|
||||
|
||||
The source file called chartables.c contains the default set of tables. This is
|
||||
not supplied in the distribution, but is built by the program dftables
|
||||
(compiled from dftables.c), which uses the ANSI C character handling functions
|
||||
such as isalnum(), isalpha(), isupper(), islower(), etc. to build the table
|
||||
sources. This means that the default C locale which is set for your system will
|
||||
control the contents of these default tables. You can change the default tables
|
||||
by editing chartables.c and then re-building PCRE. If you do this, you should
|
||||
probably also edit Makefile to ensure that the file doesn't ever get
|
||||
re-generated.
|
||||
The source file called pcre_chartables.c contains the default set of tables. By
|
||||
default, this is created as a copy of pcre_chartables.c.dist, which contains
|
||||
tables for ASCII coding. However, if --enable-rebuild-chartables is specified
|
||||
for ./configure, a different version of pcre_chartables.c is built by the
|
||||
program dftables (compiled from dftables.c), which uses the ANSI C character
|
||||
handling functions such as isalnum(), isalpha(), isupper(), islower(), etc. to
|
||||
build the table sources. This means that the default C locale which is set for
|
||||
your system will control the contents of these default tables. You can change
|
||||
the default tables by editing pcre_chartables.c and then re-building PCRE. If
|
||||
you do this, you should take care to ensure that the file does not get
|
||||
automatically re-generated. The best way to do this is to move
|
||||
pcre_chartables.c.dist out of the way and replace it with your customized
|
||||
tables.
|
||||
|
||||
When the dftables program is run as a result of --enable-rebuild-chartables,
|
||||
it uses the default C locale that is set on your system. It does not pay
|
||||
attention to the LC_xxx environment variables. In other words, it uses the
|
||||
system's default locale rather than whatever the compiling user happens to have
|
||||
set. If you really do want to build a source set of character tables in a
|
||||
locale that is specified by the LC_xxx variables, you can run the dftables
|
||||
program by hand with the -L option. For example:
|
||||
|
||||
./dftables -L pcre_chartables.c.special
|
||||
|
||||
The first two 256-byte tables provide lower casing and case flipping functions,
|
||||
respectively. The next table consists of three 32-byte bit maps which identify
|
||||
digits, "word" characters, and white space, respectively. These are used when
|
||||
building 32-byte bit maps that represent character classes.
|
||||
building 32-byte bit maps that represent character classes for code points less
|
||||
than 256.
|
||||
|
||||
The final 256-byte table has bits indicating various character types, as
|
||||
follows:
|
||||
@@ -422,107 +624,144 @@ You should not alter the set of characters that contain the 128 bit, as that
|
||||
will cause PCRE to malfunction.
|
||||
|
||||
|
||||
Manifest
|
||||
--------
|
||||
File manifest
|
||||
-------------
|
||||
|
||||
The distribution should contain the following files:
|
||||
|
||||
(A) The actual source files of the PCRE library functions and their
|
||||
headers:
|
||||
(A) Source files of the PCRE library functions and their headers:
|
||||
|
||||
dftables.c auxiliary program for building chartables.c
|
||||
dftables.c auxiliary program for building pcre_chartables.c
|
||||
when --enable-rebuild-chartables is specified
|
||||
|
||||
pcreposix.c )
|
||||
pcre_compile.c )
|
||||
pcre_config.c )
|
||||
pcre_dfa_exec.c )
|
||||
pcre_exec.c )
|
||||
pcre_fullinfo.c )
|
||||
pcre_get.c ) sources for the functions in the library,
|
||||
pcre_globals.c ) and some internal functions that they use
|
||||
pcre_info.c )
|
||||
pcre_maketables.c )
|
||||
pcre_ord2utf8.c )
|
||||
pcre_refcount.c )
|
||||
pcre_study.c )
|
||||
pcre_tables.c )
|
||||
pcre_try_flipped.c )
|
||||
pcre_ucp_searchfuncs.c)
|
||||
pcre_valid_utf8.c )
|
||||
pcre_version.c )
|
||||
pcre_xclass.c )
|
||||
ucptable.c )
|
||||
pcre_chartables.c.dist a default set of character tables that assume ASCII
|
||||
coding; used, unless --enable-rebuild-chartables is
|
||||
specified, by copying to pcre_chartables.c
|
||||
|
||||
pcre_printint.src ) debugging function that is #included in pcretest, and
|
||||
) can also be #included in pcre_compile()
|
||||
pcreposix.c )
|
||||
pcre_compile.c )
|
||||
pcre_config.c )
|
||||
pcre_dfa_exec.c )
|
||||
pcre_exec.c )
|
||||
pcre_fullinfo.c )
|
||||
pcre_get.c ) sources for the functions in the library,
|
||||
pcre_globals.c ) and some internal functions that they use
|
||||
pcre_info.c )
|
||||
pcre_maketables.c )
|
||||
pcre_newline.c )
|
||||
pcre_ord2utf8.c )
|
||||
pcre_refcount.c )
|
||||
pcre_study.c )
|
||||
pcre_tables.c )
|
||||
pcre_try_flipped.c )
|
||||
pcre_ucd.c )
|
||||
pcre_valid_utf8.c )
|
||||
pcre_version.c )
|
||||
pcre_xclass.c )
|
||||
pcre_printint.src ) debugging function that is #included in pcretest,
|
||||
) and can also be #included in pcre_compile()
|
||||
pcre.h.in template for pcre.h when built by "configure"
|
||||
pcreposix.h header for the external POSIX wrapper API
|
||||
pcre_internal.h header for internal use
|
||||
ucp.h header for Unicode property handling
|
||||
|
||||
pcre.h the public PCRE header file
|
||||
pcreposix.h header for the external POSIX wrapper API
|
||||
pcre_internal.h header for internal use
|
||||
ucp.h ) headers concerned with
|
||||
ucpinternal.h ) Unicode property handling
|
||||
config.in template for config.h, which is built by configure
|
||||
config.h.in template for config.h, which is built by "configure"
|
||||
|
||||
pcrecpp.h the header file for the C++ wrapper
|
||||
pcrecpparg.h.in "source" for another C++ header file
|
||||
pcrecpp.cc )
|
||||
pcre_scanner.cc ) source for the C++ wrapper library
|
||||
pcrecpp.h public header file for the C++ wrapper
|
||||
pcrecpparg.h.in template for another C++ header file
|
||||
pcre_scanner.h public header file for C++ scanner functions
|
||||
pcrecpp.cc )
|
||||
pcre_scanner.cc ) source for the C++ wrapper library
|
||||
|
||||
pcre_stringpiece.h.in "source" for pcre_stringpiece.h, the header for the
|
||||
C++ stringpiece functions
|
||||
pcre_stringpiece.cc source for the C++ stringpiece functions
|
||||
pcre_stringpiece.h.in template for pcre_stringpiece.h, the header for the
|
||||
C++ stringpiece functions
|
||||
pcre_stringpiece.cc source for the C++ stringpiece functions
|
||||
|
||||
(B) Auxiliary files:
|
||||
(B) Source files for programs that use PCRE:
|
||||
|
||||
AUTHORS information about the author of PCRE
|
||||
ChangeLog log of changes to the code
|
||||
INSTALL generic installation instructions
|
||||
LICENCE conditions for the use of PCRE
|
||||
COPYING the same, using GNU's standard name
|
||||
Makefile.in template for Unix Makefile, which is built by configure
|
||||
NEWS important changes in this release
|
||||
NON-UNIX-USE notes on building PCRE on non-Unix systems
|
||||
README this file
|
||||
RunTest.in template for a Unix shell script for running tests
|
||||
RunGrepTest.in template for a Unix shell script for pcregrep tests
|
||||
config.guess ) files used by libtool,
|
||||
config.sub ) used only when building a shared library
|
||||
config.h.in "source" for the config.h header file
|
||||
configure a configuring shell script (built by autoconf)
|
||||
configure.ac the autoconf input used to build configure
|
||||
doc/Tech.Notes notes on the encoding
|
||||
doc/*.3 man page sources for the PCRE functions
|
||||
doc/*.1 man page sources for pcregrep and pcretest
|
||||
doc/html/* HTML documentation
|
||||
doc/pcre.txt plain text version of the man pages
|
||||
doc/pcretest.txt plain text documentation of test program
|
||||
doc/perltest.txt plain text documentation of Perl test program
|
||||
install-sh a shell script for installing files
|
||||
libpcre.pc.in "source" for libpcre.pc for pkg-config
|
||||
ltmain.sh file used to build a libtool script
|
||||
mkinstalldirs script for making install directories
|
||||
pcretest.c comprehensive test program
|
||||
pcredemo.c simple demonstration of coding calls to PCRE
|
||||
perltest Perl test program
|
||||
pcregrep.c source of a grep utility that uses PCRE
|
||||
pcre-config.in source of script which retains PCRE information
|
||||
pcrecpp_unittest.c )
|
||||
pcre_scanner_unittest.c ) test programs for the C++ wrapper
|
||||
pcre_stringpiece_unittest.c )
|
||||
testdata/testinput* test data for main library tests
|
||||
testdata/testoutput* expected test results
|
||||
testdata/grep* input and output for pcregrep tests
|
||||
pcredemo.c simple demonstration of coding calls to PCRE
|
||||
pcregrep.c source of a grep utility that uses PCRE
|
||||
pcretest.c comprehensive test program
|
||||
|
||||
(C) Auxiliary files for Win32 DLL
|
||||
(C) Auxiliary files:
|
||||
|
||||
libpcre.def
|
||||
libpcreposix.def
|
||||
132html script to turn "man" pages into HTML
|
||||
AUTHORS information about the author of PCRE
|
||||
ChangeLog log of changes to the code
|
||||
CleanTxt script to clean nroff output for txt man pages
|
||||
Detrail script to remove trailing spaces
|
||||
HACKING some notes about the internals of PCRE
|
||||
INSTALL generic installation instructions
|
||||
LICENCE conditions for the use of PCRE
|
||||
COPYING the same, using GNU's standard name
|
||||
Makefile.in ) template for Unix Makefile, which is built by
|
||||
) "configure"
|
||||
Makefile.am ) the automake input that was used to create
|
||||
) Makefile.in
|
||||
NEWS important changes in this release
|
||||
NON-UNIX-USE notes on building PCRE on non-Unix systems
|
||||
PrepareRelease script to make preparations for "make dist"
|
||||
README this file
|
||||
RunTest a Unix shell script for running tests
|
||||
RunGrepTest a Unix shell script for pcregrep tests
|
||||
aclocal.m4 m4 macros (generated by "aclocal")
|
||||
config.guess ) files used by libtool,
|
||||
config.sub ) used only when building a shared library
|
||||
configure a configuring shell script (built by autoconf)
|
||||
configure.ac ) the autoconf input that was used to build
|
||||
) "configure" and config.h
|
||||
depcomp ) script to find program dependencies, generated by
|
||||
) automake
|
||||
doc/*.3 man page sources for the PCRE functions
|
||||
doc/*.1 man page sources for pcregrep and pcretest
|
||||
doc/index.html.src the base HTML page
|
||||
doc/html/* HTML documentation
|
||||
doc/pcre.txt plain text version of the man pages
|
||||
doc/pcretest.txt plain text documentation of test program
|
||||
doc/perltest.txt plain text documentation of Perl test program
|
||||
install-sh a shell script for installing files
|
||||
libpcre.pc.in template for libpcre.pc for pkg-config
|
||||
libpcrecpp.pc.in template for libpcrecpp.pc for pkg-config
|
||||
ltmain.sh file used to build a libtool script
|
||||
missing ) common stub for a few missing GNU programs while
|
||||
) installing, generated by automake
|
||||
mkinstalldirs script for making install directories
|
||||
perltest.pl Perl test program
|
||||
pcre-config.in source of script which retains PCRE information
|
||||
pcrecpp_unittest.cc )
|
||||
pcre_scanner_unittest.cc ) test programs for the C++ wrapper
|
||||
pcre_stringpiece_unittest.cc )
|
||||
testdata/testinput* test data for main library tests
|
||||
testdata/testoutput* expected test results
|
||||
testdata/grep* input and output for pcregrep tests
|
||||
|
||||
(D) Auxiliary file for VPASCAL
|
||||
(D) Auxiliary files for cmake support
|
||||
|
||||
cmake/COPYING-CMAKE-SCRIPTS
|
||||
cmake/FindPackageHandleStandardArgs.cmake
|
||||
cmake/FindReadline.cmake
|
||||
CMakeLists.txt
|
||||
config-cmake.h.in
|
||||
|
||||
(E) Auxiliary files for VPASCAL
|
||||
|
||||
makevp.bat
|
||||
makevp_c.txt
|
||||
makevp_l.txt
|
||||
pcregexp.pas
|
||||
|
||||
(F) Auxiliary files for building PCRE "by hand"
|
||||
|
||||
pcre.h.generic ) a version of the public PCRE header file
|
||||
) for use in non-"configure" environments
|
||||
config.h.generic ) a version of config.h for use in non-"configure"
|
||||
) environments
|
||||
|
||||
(G) Miscellaneous
|
||||
|
||||
RunTest.bat a script for running tests under Windows
|
||||
|
||||
Philip Hazel
|
||||
Email local part: ph10
|
||||
Email domain: cam.ac.uk
|
||||
June 2006
|
||||
Last updated: 21 March 2009
|
||||
|
||||
@@ -0,0 +1,39 @@
|
||||
@rem This file was contributed by Ralf Junker, and touched up by
|
||||
@rem Daniel Richard G. Test 10 added by Philip H.
|
||||
@rem Philip H also changed test 3 to use "wintest" files.
|
||||
@rem
|
||||
@rem MS Windows batch file to run pcretest on testfiles with the correct
|
||||
@rem options.
|
||||
@rem
|
||||
@rem Output is written to a subfolder named "testout", which is created if needed.
|
||||
|
||||
setlocal
|
||||
|
||||
if [%srcdir%]==[] set srcdir=.
|
||||
if [%pcretest%]==[] set pcretest=pcretest
|
||||
|
||||
if not exist testout md testout
|
||||
|
||||
%pcretest% -q %srcdir%\testdata\testinput1 > testout\testoutput1
|
||||
%pcretest% -q %srcdir%\testdata\testinput2 > testout\testoutput2
|
||||
@rem %pcretest% -q %srcdir%\testdata\testinput3 > testout\testoutput3
|
||||
%pcretest% -q %srcdir%\testdata\wintestinput3 > testout\wintestoutput3
|
||||
%pcretest% -q %srcdir%\testdata\testinput4 > testout\testoutput4
|
||||
%pcretest% -q %srcdir%\testdata\testinput5 > testout\testoutput5
|
||||
%pcretest% -q %srcdir%\testdata\testinput6 > testout\testoutput6
|
||||
%pcretest% -q -dfa %srcdir%\testdata\testinput7 > testout\testoutput7
|
||||
%pcretest% -q -dfa %srcdir%\testdata\testinput8 > testout\testoutput8
|
||||
%pcretest% -q -dfa %srcdir%\testdata\testinput9 > testout\testoutput9
|
||||
%pcretest% -q %srcdir%\testdata\testinput10 > testout\testoutput10
|
||||
|
||||
fc /n %srcdir%\testdata\testoutput1 testout\testoutput1
|
||||
fc /n %srcdir%\testdata\testoutput2 testout\testoutput2
|
||||
rem fc /n %srcdir%\testdata\testoutput3 testout\testoutput3
|
||||
fc /n %srcdir%\testdata\wintestoutput3 testout\wintestoutput3
|
||||
fc /n %srcdir%\testdata\testoutput4 testout\testoutput4
|
||||
fc /n %srcdir%\testdata\testoutput5 testout\testoutput5
|
||||
fc /n %srcdir%\testdata\testoutput6 testout\testoutput6
|
||||
fc /n %srcdir%\testdata\testoutput7 testout\testoutput7
|
||||
fc /n %srcdir%\testdata\testoutput8 testout\testoutput8
|
||||
fc /n %srcdir%\testdata\testoutput9 testout\testoutput9
|
||||
fc /n %srcdir%\testdata\testoutput10 testout\testoutput10
|
||||
@@ -0,0 +1,22 @@
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions
|
||||
are met:
|
||||
|
||||
1. Redistributions of source code must retain the copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
3. The name of the author may not be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
|
||||
IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||
OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
|
||||
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
|
||||
INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
|
||||
NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
|
||||
THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
@@ -0,0 +1,58 @@
|
||||
# FIND_PACKAGE_HANDLE_STANDARD_ARGS(NAME (DEFAULT_MSG|"Custom failure message") VAR1 ... )
|
||||
# This macro is intended to be used in FindXXX.cmake module files.
|
||||
# It handles the REQUIRED and QUIET arguments to FIND_PACKAGE() and
|
||||
# it also sets the <UPPERCASED_NAME>_FOUND variable.
|
||||
# The package is found if all variables listed are TRUE.
|
||||
# Example:
|
||||
#
|
||||
# FIND_PACKAGE_HANDLE_STANDARD_ARGS(LibXml2 DEFAULT_MSG LIBXML2_LIBRARIES LIBXML2_INCLUDE_DIR)
|
||||
#
|
||||
# LibXml2 is considered to be found, if both LIBXML2_LIBRARIES and
|
||||
# LIBXML2_INCLUDE_DIR are valid. Then also LIBXML2_FOUND is set to TRUE.
|
||||
# If it is not found and REQUIRED was used, it fails with FATAL_ERROR,
|
||||
# independent of whether QUIET was used or not.
|
||||
# If it is found, the location is reported using the VAR1 argument, so
|
||||
# here a message "Found LibXml2: /usr/lib/libxml2.so" will be printed out.
|
||||
# If the second argument is DEFAULT_MSG, the message in the failure case will
|
||||
# be "Could not find REQUIRED package LibXml2" (or "... OPTIONAL package LibXml2"); if you don't like this message you can specify
|
||||
# your own custom failure message there.
|
||||
|
||||
MACRO(FIND_PACKAGE_HANDLE_STANDARD_ARGS _NAME _FAIL_MSG _VAR1 )
|
||||
|
||||
IF("${_FAIL_MSG}" STREQUAL "DEFAULT_MSG")
|
||||
IF (${_NAME}_FIND_REQUIRED)
|
||||
SET(_FAIL_MESSAGE "Could not find REQUIRED package ${_NAME}")
|
||||
ELSE (${_NAME}_FIND_REQUIRED)
|
||||
SET(_FAIL_MESSAGE "Could not find OPTIONAL package ${_NAME}")
|
||||
ENDIF (${_NAME}_FIND_REQUIRED)
|
||||
ELSE("${_FAIL_MSG}" STREQUAL "DEFAULT_MSG")
|
||||
SET(_FAIL_MESSAGE "${_FAIL_MSG}")
|
||||
ENDIF("${_FAIL_MSG}" STREQUAL "DEFAULT_MSG")
|
||||
|
||||
STRING(TOUPPER ${_NAME} _NAME_UPPER)
|
||||
|
||||
SET(${_NAME_UPPER}_FOUND TRUE)
|
||||
IF(NOT ${_VAR1})
|
||||
SET(${_NAME_UPPER}_FOUND FALSE)
|
||||
ENDIF(NOT ${_VAR1})
|
||||
|
||||
FOREACH(_CURRENT_VAR ${ARGN})
|
||||
IF(NOT ${_CURRENT_VAR})
|
||||
SET(${_NAME_UPPER}_FOUND FALSE)
|
||||
ENDIF(NOT ${_CURRENT_VAR})
|
||||
ENDFOREACH(_CURRENT_VAR)
|
||||
|
||||
IF (${_NAME_UPPER}_FOUND)
|
||||
IF (NOT ${_NAME}_FIND_QUIETLY)
|
||||
MESSAGE(STATUS "Found ${_NAME}: ${${_VAR1}}")
|
||||
ENDIF (NOT ${_NAME}_FIND_QUIETLY)
|
||||
ELSE (${_NAME_UPPER}_FOUND)
|
||||
IF (${_NAME}_FIND_REQUIRED)
|
||||
MESSAGE(FATAL_ERROR "${_FAIL_MESSAGE}")
|
||||
ELSE (${_NAME}_FIND_REQUIRED)
|
||||
IF (NOT ${_NAME}_FIND_QUIETLY)
|
||||
MESSAGE(STATUS "${_FAIL_MESSAGE}")
|
||||
ENDIF (NOT ${_NAME}_FIND_QUIETLY)
|
||||
ENDIF (${_NAME}_FIND_REQUIRED)
|
||||
ENDIF (${_NAME_UPPER}_FOUND)
|
||||
ENDMACRO(FIND_PACKAGE_HANDLE_STANDARD_ARGS)
|
||||
@@ -0,0 +1,29 @@
|
||||
# from http://websvn.kde.org/trunk/KDE/kdeedu/cmake/modules/FindReadline.cmake
|
||||
# http://websvn.kde.org/trunk/KDE/kdeedu/cmake/modules/COPYING-CMAKE-SCRIPTS
|
||||
# --> BSD licensed
|
||||
#
|
||||
# GNU Readline library finder
|
||||
if(READLINE_INCLUDE_DIR AND READLINE_LIBRARY AND NCURSES_LIBRARY)
|
||||
set(READLINE_FOUND TRUE)
|
||||
else(READLINE_INCLUDE_DIR AND READLINE_LIBRARY AND NCURSES_LIBRARY)
|
||||
FIND_PATH(READLINE_INCLUDE_DIR readline/readline.h
|
||||
/usr/include/readline
|
||||
)
|
||||
|
||||
# 2008-04-22 The next clause used to read like this:
|
||||
#
|
||||
# FIND_LIBRARY(READLINE_LIBRARY NAMES readline)
|
||||
# FIND_LIBRARY(NCURSES_LIBRARY NAMES ncurses )
|
||||
# include(FindPackageHandleStandardArgs)
|
||||
# FIND_PACKAGE_HANDLE_STANDARD_ARGS(Readline DEFAULT_MSG NCURSES_LIBRARY READLINE_INCLUDE_DIR READLINE_LIBRARY )
|
||||
#
|
||||
# I was advised to modify it such that it will find an ncurses library if
|
||||
# required, but not if one was explicitly given, that is, it allows the
|
||||
# default to be overridden. PH
|
||||
|
||||
FIND_LIBRARY(READLINE_LIBRARY NAMES readline)
|
||||
include(FindPackageHandleStandardArgs)
|
||||
FIND_PACKAGE_HANDLE_STANDARD_ARGS(Readline DEFAULT_MSG READLINE_INCLUDE_DIR READLINE_LIBRARY )
|
||||
|
||||
MARK_AS_ADVANCED(READLINE_INCLUDE_DIR READLINE_LIBRARY)
|
||||
endif(READLINE_INCLUDE_DIR AND READLINE_LIBRARY AND NCURSES_LIBRARY)
|
||||
@@ -0,0 +1,44 @@
|
||||
/* config.h for CMake builds */
|
||||
|
||||
#cmakedefine HAVE_DIRENT_H 1
|
||||
#cmakedefine HAVE_SYS_STAT_H 1
|
||||
#cmakedefine HAVE_SYS_TYPES_H 1
|
||||
#cmakedefine HAVE_UNISTD_H 1
|
||||
#cmakedefine HAVE_WINDOWS_H 1
|
||||
|
||||
#cmakedefine HAVE_TYPE_TRAITS_H 1
|
||||
#cmakedefine HAVE_BITS_TYPE_TRAITS_H 1
|
||||
|
||||
#cmakedefine HAVE_BCOPY 1
|
||||
#cmakedefine HAVE_MEMMOVE 1
|
||||
#cmakedefine HAVE_STRERROR 1
|
||||
#cmakedefine HAVE_STRTOLL 1
|
||||
#cmakedefine HAVE_STRTOQ 1
|
||||
#cmakedefine HAVE__STRTOI64 1
|
||||
|
||||
#cmakedefine PCRE_STATIC 1
|
||||
|
||||
#cmakedefine SUPPORT_UTF8 1
|
||||
#cmakedefine SUPPORT_UCP 1
|
||||
#cmakedefine EBCDIC 1
|
||||
#cmakedefine BSR_ANYCRLF 1
|
||||
#cmakedefine NO_RECURSE 1
|
||||
|
||||
#cmakedefine HAVE_LONG_LONG 1
|
||||
#cmakedefine HAVE_UNSIGNED_LONG_LONG 1
|
||||
|
||||
#cmakedefine SUPPORT_LIBBZ2 1
|
||||
#cmakedefine SUPPORT_LIBZ 1
|
||||
#cmakedefine SUPPORT_LIBREADLINE 1
|
||||
|
||||
#define NEWLINE @NEWLINE@
|
||||
#define POSIX_MALLOC_THRESHOLD @PCRE_POSIX_MALLOC_THRESHOLD@
|
||||
#define LINK_SIZE @PCRE_LINK_SIZE@
|
||||
#define MATCH_LIMIT @PCRE_MATCH_LIMIT@
|
||||
#define MATCH_LIMIT_RECURSION @PCRE_MATCH_LIMIT_RECURSION@
|
||||
|
||||
|
||||
#define MAX_NAME_SIZE 32
|
||||
#define MAX_NAME_COUNT 10000
|
||||
|
||||
/* end config.h for CMake builds */
|
||||
@@ -0,0 +1,313 @@
|
||||
/* config.h. Generated from config.h.in by configure. */
|
||||
/* config.h.in. Generated from configure.ac by autoheader. */
|
||||
|
||||
|
||||
/* On Unix-like systems config.h.in is converted by "configure" into config.h.
|
||||
Some other environments also support the use of "configure". PCRE is written in
|
||||
Standard C, but there are a few non-standard things it can cope with, allowing
|
||||
it to run on SunOS4 and other "close to standard" systems.
|
||||
|
||||
If you are going to build PCRE "by hand" on a system without "configure" you
|
||||
should copy the distributed config.h.generic to config.h, and then set up the
|
||||
macro definitions the way you need them. You must then add -DHAVE_CONFIG_H to
|
||||
all of your compile commands, so that config.h is included at the start of
|
||||
every source.
|
||||
|
||||
Alternatively, you can avoid editing by using -D on the compiler command line
|
||||
to set the macro values. In this case, you do not have to set -DHAVE_CONFIG_H.
|
||||
|
||||
PCRE uses memmove() if HAVE_MEMMOVE is set to 1; otherwise it uses bcopy() if
|
||||
HAVE_BCOPY is set to 1. If your system has neither bcopy() nor memmove(), set
|
||||
them both to 0; an emulation function will be used. */
|
||||
|
||||
/* By default, the \R escape sequence matches any Unicode line ending
|
||||
character or sequence of characters. If BSR_ANYCRLF is defined, this is
|
||||
changed so that backslash-R matches only CR, LF, or CRLF. The build-time
|
||||
default can be overridden by the user of PCRE at runtime. On systems that
|
||||
support it, "configure" can be used to override the default. */
|
||||
/* #undef BSR_ANYCRLF */
|
||||
|
||||
/* If you are compiling for a system that uses EBCDIC instead of ASCII
|
||||
character codes, define this macro as 1. On systems that can use
|
||||
"configure", this can be done via --enable-ebcdic. PCRE will then assume
|
||||
that all input strings are in EBCDIC. If you do not define this macro, PCRE
|
||||
will assume input strings are ASCII or UTF-8 Unicode. It is not possible to
|
||||
build a version of PCRE that supports both EBCDIC and UTF-8. */
|
||||
/* #undef EBCDIC */
|
||||
|
||||
/* Define to 1 if you have the `bcopy' function. */
|
||||
#ifndef HAVE_BCOPY
|
||||
#define HAVE_BCOPY 1
|
||||
#endif
|
||||
|
||||
/* Define to 1 if you have the <bits/type_traits.h> header file. */
|
||||
/* #undef HAVE_BITS_TYPE_TRAITS_H */
|
||||
|
||||
/* Define to 1 if you have the <bzlib.h> header file. */
|
||||
#ifndef HAVE_BZLIB_H
|
||||
#define HAVE_BZLIB_H 1
|
||||
#endif
|
||||
|
||||
/* Define to 1 if you have the <dirent.h> header file. */
|
||||
#ifndef HAVE_DIRENT_H
|
||||
#define HAVE_DIRENT_H 1
|
||||
#endif
|
||||
|
||||
/* Define to 1 if you have the <dlfcn.h> header file. */
|
||||
#ifndef HAVE_DLFCN_H
|
||||
#define HAVE_DLFCN_H 1
|
||||
#endif
|
||||
|
||||
/* Define to 1 if you have the <inttypes.h> header file. */
|
||||
#ifndef HAVE_INTTYPES_H
|
||||
#define HAVE_INTTYPES_H 1
|
||||
#endif
|
||||
|
||||
/* Define to 1 if you have the <limits.h> header file. */
|
||||
#ifndef HAVE_LIMITS_H
|
||||
#define HAVE_LIMITS_H 1
|
||||
#endif
|
||||
|
||||
/* Define to 1 if the system has the type `long long'. */
|
||||
#ifndef HAVE_LONG_LONG
|
||||
#define HAVE_LONG_LONG 1
|
||||
#endif
|
||||
|
||||
/* Define to 1 if you have the `memmove' function. */
|
||||
#ifndef HAVE_MEMMOVE
|
||||
#define HAVE_MEMMOVE 1
|
||||
#endif
|
||||
|
||||
/* Define to 1 if you have the <memory.h> header file. */
|
||||
#ifndef HAVE_MEMORY_H
|
||||
#define HAVE_MEMORY_H 1
|
||||
#endif
|
||||
|
||||
/* Define to 1 if you have the <readline/history.h> header file. */
|
||||
#ifndef HAVE_READLINE_HISTORY_H
|
||||
#define HAVE_READLINE_HISTORY_H 1
|
||||
#endif
|
||||
|
||||
/* Define to 1 if you have the <readline/readline.h> header file. */
|
||||
#ifndef HAVE_READLINE_READLINE_H
|
||||
#define HAVE_READLINE_READLINE_H 1
|
||||
#endif
|
||||
|
||||
/* Define to 1 if you have the <stdint.h> header file. */
|
||||
#ifndef HAVE_STDINT_H
|
||||
#define HAVE_STDINT_H 1
|
||||
#endif
|
||||
|
||||
/* Define to 1 if you have the <stdlib.h> header file. */
|
||||
#ifndef HAVE_STDLIB_H
|
||||
#define HAVE_STDLIB_H 1
|
||||
#endif
|
||||
|
||||
/* Define to 1 if you have the `strerror' function. */
|
||||
#ifndef HAVE_STRERROR
|
||||
#define HAVE_STRERROR 1
|
||||
#endif
|
||||
|
||||
/* Define to 1 if you have the <string> header file. */
|
||||
#ifndef HAVE_STRING
|
||||
#define HAVE_STRING 1
|
||||
#endif
|
||||
|
||||
/* Define to 1 if you have the <strings.h> header file. */
|
||||
#ifndef HAVE_STRINGS_H
|
||||
#define HAVE_STRINGS_H 1
|
||||
#endif
|
||||
|
||||
/* Define to 1 if you have the <string.h> header file. */
|
||||
#ifndef HAVE_STRING_H
|
||||
#define HAVE_STRING_H 1
|
||||
#endif
|
||||
|
||||
/* Define to 1 if you have the `strtoll' function. */
|
||||
/* #undef HAVE_STRTOLL */
|
||||
|
||||
/* Define to 1 if you have the `strtoq' function. */
|
||||
#ifndef HAVE_STRTOQ
|
||||
#define HAVE_STRTOQ 1
|
||||
#endif
|
||||
|
||||
/* Define to 1 if you have the <sys/stat.h> header file. */
|
||||
#ifndef HAVE_SYS_STAT_H
|
||||
#define HAVE_SYS_STAT_H 1
|
||||
#endif
|
||||
|
||||
/* Define to 1 if you have the <sys/types.h> header file. */
|
||||
#ifndef HAVE_SYS_TYPES_H
|
||||
#define HAVE_SYS_TYPES_H 1
|
||||
#endif
|
||||
|
||||
/* Define to 1 if you have the <type_traits.h> header file. */
|
||||
/* #undef HAVE_TYPE_TRAITS_H */
|
||||
|
||||
/* Define to 1 if you have the <unistd.h> header file. */
|
||||
#ifndef HAVE_UNISTD_H
|
||||
#define HAVE_UNISTD_H 1
|
||||
#endif
|
||||
|
||||
/* Define to 1 if the system has the type `unsigned long long'. */
|
||||
#ifndef HAVE_UNSIGNED_LONG_LONG
|
||||
#define HAVE_UNSIGNED_LONG_LONG 1
|
||||
#endif
|
||||
|
||||
/* Define to 1 if you have the <windows.h> header file. */
|
||||
/* #undef HAVE_WINDOWS_H */
|
||||
|
||||
/* Define to 1 if you have the <zlib.h> header file. */
|
||||
#ifndef HAVE_ZLIB_H
|
||||
#define HAVE_ZLIB_H 1
|
||||
#endif
|
||||
|
||||
/* Define to 1 if you have the `_strtoi64' function. */
|
||||
/* #undef HAVE__STRTOI64 */
|
||||
|
||||
/* The value of LINK_SIZE determines the number of bytes used to store links
|
||||
as offsets within the compiled regex. The default is 2, which allows for
|
||||
compiled patterns up to 64K long. This covers the vast majority of cases.
|
||||
However, PCRE can also be compiled to use 3 or 4 bytes instead. This allows
|
||||
for longer patterns in extreme cases. On systems that support it,
|
||||
"configure" can be used to override this default. */
|
||||
#ifndef LINK_SIZE
|
||||
#define LINK_SIZE 2
|
||||
#endif
|
||||
|
||||
/* The value of MATCH_LIMIT determines the default number of times the
|
||||
internal match() function can be called during a single execution of
|
||||
pcre_exec(). There is a runtime interface for setting a different limit.
|
||||
The limit exists in order to catch runaway regular expressions that take
|
||||
for ever to determine that they do not match. The default is set very large
|
||||
so that it does not accidentally catch legitimate cases. On systems that
|
||||
support it, "configure" can be used to override this default. */
|
||||
#ifndef MATCH_LIMIT
|
||||
#define MATCH_LIMIT 10000000
|
||||
#endif
|
||||
|
||||
/* The above limit applies to all calls of match(), whether or not they
|
||||
increase the recursion depth. In some environments it is desirable to limit
|
||||
the depth of recursive calls of match() more strictly, in order to restrict
|
||||
the maximum amount of stack (or heap, if NO_RECURSE is defined) that is
|
||||
used. The value of MATCH_LIMIT_RECURSION applies only to recursive calls of
|
||||
match(). To have any useful effect, it must be less than the value of
|
||||
MATCH_LIMIT. The default is to use the same value as MATCH_LIMIT. There is
|
||||
a runtime method for setting a different limit. On systems that support it,
|
||||
"configure" can be used to override the default. */
|
||||
#ifndef MATCH_LIMIT_RECURSION
|
||||
#define MATCH_LIMIT_RECURSION MATCH_LIMIT
|
||||
#endif
|
||||
|
||||
/* This limit is parameterized just in case anybody ever wants to change it.
|
||||
Care must be taken if it is increased, because it guards against integer
|
||||
overflow caused by enormously large patterns. */
|
||||
#ifndef MAX_NAME_COUNT
|
||||
#define MAX_NAME_COUNT 10000
|
||||
#endif
|
||||
|
||||
/* This limit is parameterized just in case anybody ever wants to change it.
|
||||
Care must be taken if it is increased, because it guards against integer
|
||||
overflow caused by enormously large patterns. */
|
||||
#ifndef MAX_NAME_SIZE
|
||||
#define MAX_NAME_SIZE 32
|
||||
#endif
|
||||
|
||||
/* The value of NEWLINE determines the newline character sequence. On systems
|
||||
that support it, "configure" can be used to override the default, which is
|
||||
10. The possible values are 10 (LF), 13 (CR), 3338 (CRLF), -1 (ANY), or -2
|
||||
(ANYCRLF). */
|
||||
#ifndef NEWLINE
|
||||
#define NEWLINE 10
|
||||
#endif
|
||||
|
||||
/* PCRE uses recursive function calls to handle backtracking while matching.
|
||||
This can sometimes be a problem on systems that have stacks of limited
|
||||
size. Define NO_RECURSE to get a version that doesn't use recursion in the
|
||||
match() function; instead it creates its own stack by steam using
|
||||
pcre_recurse_malloc() to obtain memory from the heap. For more detail, see
|
||||
the comments and other stuff just above the match() function. On systems
|
||||
that support it, "configure" can be used to set this in the Makefile (use
|
||||
--disable-stack-for-recursion). */
|
||||
/* #undef NO_RECURSE */
|
||||
|
||||
/* Name of package */
|
||||
#define PACKAGE "pcre"
|
||||
|
||||
/* Define to the address where bug reports for this package should be sent. */
|
||||
#define PACKAGE_BUGREPORT ""
|
||||
|
||||
/* Define to the full name of this package. */
|
||||
#define PACKAGE_NAME "PCRE"
|
||||
|
||||
/* Define to the full name and version of this package. */
|
||||
#define PACKAGE_STRING "PCRE 7.9"
|
||||
|
||||
/* Define to the one symbol short name of this package. */
|
||||
#define PACKAGE_TARNAME "pcre"
|
||||
|
||||
/* Define to the version of this package. */
|
||||
#define PACKAGE_VERSION "7.9"
|
||||
|
||||
|
||||
/* If you are compiling for a system other than a Unix-like system or
|
||||
Win32, and it needs some magic to be inserted before the definition
|
||||
of a function that is exported by the library, define this macro to
|
||||
contain the relevant magic. If you do not define this macro, it
|
||||
defaults to "extern" for a C compiler and "extern C" for a C++
|
||||
compiler on non-Win32 systems. This macro appears at the start of
|
||||
every exported function that is part of the external API. It does
|
||||
not appear on functions that are "external" in the C sense, but
|
||||
which are internal to the library. */
|
||||
/* #undef PCRE_EXP_DEFN */
|
||||
|
||||
/* Define if linking statically (TODO: make nice with Libtool) */
|
||||
/* #undef PCRE_STATIC */
|
||||
|
||||
/* When calling PCRE via the POSIX interface, additional working storage is
|
||||
required for holding the pointers to capturing substrings because PCRE
|
||||
requires three integers per substring, whereas the POSIX interface provides
|
||||
only two. If the number of expected substrings is small, the wrapper
|
||||
function uses space on the stack, because this is faster than using
|
||||
malloc() for each call. The threshold above which the stack is no longer
|
||||
used is defined by POSIX_MALLOC_THRESHOLD. On systems that support it,
|
||||
"configure" can be used to override this default. */
|
||||
#ifndef POSIX_MALLOC_THRESHOLD
|
||||
#define POSIX_MALLOC_THRESHOLD 10
|
||||
#endif
|
||||
|
||||
/* Define to 1 if you have the ANSI C header files. */
|
||||
#ifndef STDC_HEADERS
|
||||
#define STDC_HEADERS 1
|
||||
#endif
|
||||
|
||||
/* Define to allow pcregrep to be linked with libbz2, so that it is able to
|
||||
handle .bz2 files. */
|
||||
/* #undef SUPPORT_LIBBZ2 */
|
||||
|
||||
/* Define to allow pcretest to be linked with libreadline. */
|
||||
/* #undef SUPPORT_LIBREADLINE */
|
||||
|
||||
/* Define to allow pcregrep to be linked with libz, so that it is able to
|
||||
handle .gz files. */
|
||||
/* #undef SUPPORT_LIBZ */
|
||||
|
||||
/* Define to enable support for Unicode properties */
|
||||
/* #undef SUPPORT_UCP */
|
||||
|
||||
/* Define to enable support for the UTF-8 Unicode encoding. This will work
|
||||
even in an EBCDIC environment, but it is incompatible with the EBCDIC
|
||||
macro. That is, PCRE can support *either* EBCDIC code *or* ASCII/UTF-8, but
|
||||
not both at once. */
|
||||
/* #undef SUPPORT_UTF8 */
|
||||
|
||||
/* Version number of package */
|
||||
#ifndef VERSION
|
||||
#define VERSION "7.9"
|
||||
#endif
|
||||
|
||||
/* Define to empty if `const' does not conform to ANSI C. */
|
||||
/* #undef const */
|
||||
|
||||
/* Define to `unsigned int' if <sys/types.h> does not define. */
|
||||
/* #undef size_t */
|
||||
+212
-107
@@ -1,143 +1,248 @@
|
||||
/* config.h.in. Generated from configure.ac by autoheader. */
|
||||
|
||||
/* On Unix-like systems config.in is converted by "configure" into config.h.
|
||||
|
||||
/* On Unix-like systems config.h.in is converted by "configure" into config.h.
|
||||
Some other environments also support the use of "configure". PCRE is written in
|
||||
Standard C, but there are a few non-standard things it can cope with, allowing
|
||||
it to run on SunOS4 and other "close to standard" systems.
|
||||
|
||||
On a non-Unix-like system you should just copy this file into config.h, and set
|
||||
up the macros the way you need them. You should normally change the definitions
|
||||
of HAVE_STRERROR and HAVE_MEMMOVE to 1. Unfortunately, because of the way
|
||||
autoconf works, these cannot be made the defaults. If your system has bcopy()
|
||||
and not memmove(), change the definition of HAVE_BCOPY instead of HAVE_MEMMOVE.
|
||||
If your system has neither bcopy() nor memmove(), leave them both as 0; an
|
||||
emulation function will be used. */
|
||||
If you are going to build PCRE "by hand" on a system without "configure" you
|
||||
should copy the distributed config.h.generic to config.h, and then set up the
|
||||
macro definitions the way you need them. You must then add -DHAVE_CONFIG_H to
|
||||
all of your compile commands, so that config.h is included at the start of
|
||||
every source.
|
||||
|
||||
Alternatively, you can avoid editing by using -D on the compiler command line
|
||||
to set the macro values. In this case, you do not have to set -DHAVE_CONFIG_H.
|
||||
|
||||
PCRE uses memmove() if HAVE_MEMMOVE is set to 1; otherwise it uses bcopy() if
|
||||
HAVE_BCOPY is set to 1. If your system has neither bcopy() nor memmove(), set
|
||||
them both to 0; an emulation function will be used. */
|
||||
|
||||
/* By default, the \R escape sequence matches any Unicode line ending
|
||||
character or sequence of characters. If BSR_ANYCRLF is defined, this is
|
||||
changed so that backslash-R matches only CR, LF, or CRLF. The build-time
|
||||
default can be overridden by the user of PCRE at runtime. On systems that
|
||||
support it, "configure" can be used to override the default. */
|
||||
#undef BSR_ANYCRLF
|
||||
|
||||
/* If you are compiling for a system that uses EBCDIC instead of ASCII
|
||||
character codes, define this macro as 1. On systems that can use "configure",
|
||||
this can be done via --enable-ebcdic. */
|
||||
character codes, define this macro as 1. On systems that can use
|
||||
"configure", this can be done via --enable-ebcdic. PCRE will then assume
|
||||
that all input strings are in EBCDIC. If you do not define this macro, PCRE
|
||||
will assume input strings are ASCII or UTF-8 Unicode. It is not possible to
|
||||
build a version of PCRE that supports both EBCDIC and UTF-8. */
|
||||
#undef EBCDIC
|
||||
|
||||
#ifndef EBCDIC
|
||||
#define EBCDIC 0
|
||||
#endif
|
||||
/* Define to 1 if you have the `bcopy' function. */
|
||||
#undef HAVE_BCOPY
|
||||
|
||||
/* If you are compiling for a system other than a Unix-like system or Win32,
|
||||
and it needs some magic to be inserted before the definition of a function that
|
||||
is exported by the library, define this macro to contain the relevant magic. If
|
||||
you do not define this macro, it defaults to "extern" for a C compiler and
|
||||
"extern C" for a C++ compiler on non-Win32 systems. This macro appears at the
|
||||
start of every exported function that is part of the external API. It does not
|
||||
appear on functions that are "external" in the C sense, but which are internal
|
||||
to the library. */
|
||||
/* Define to 1 if you have the <bits/type_traits.h> header file. */
|
||||
#undef HAVE_BITS_TYPE_TRAITS_H
|
||||
|
||||
/* #define PCRE_DATA_SCOPE */
|
||||
/* Define to 1 if you have the <bzlib.h> header file. */
|
||||
#undef HAVE_BZLIB_H
|
||||
|
||||
/* Define the following macro to empty if the "const" keyword does not work. */
|
||||
/* Define to 1 if you have the <dirent.h> header file. */
|
||||
#undef HAVE_DIRENT_H
|
||||
|
||||
#undef const
|
||||
/* Define to 1 if you have the <dlfcn.h> header file. */
|
||||
#undef HAVE_DLFCN_H
|
||||
|
||||
/* Define the following macro to "unsigned" if <stddef.h> does not define
|
||||
size_t. */
|
||||
/* Define to 1 if you have the <inttypes.h> header file. */
|
||||
#undef HAVE_INTTYPES_H
|
||||
|
||||
#undef size_t
|
||||
/* Define to 1 if you have the <limits.h> header file. */
|
||||
#undef HAVE_LIMITS_H
|
||||
|
||||
/* The following two definitions are mainly for the benefit of SunOS4, which
|
||||
does not have the strerror() or memmove() functions that should be present in
|
||||
all Standard C libraries. The macros HAVE_STRERROR and HAVE_MEMMOVE should
|
||||
normally be defined with the value 1 for other systems, but unfortunately we
|
||||
cannot make this the default because "configure" files generated by autoconf
|
||||
will only change 0 to 1; they won't change 1 to 0 if the functions are not
|
||||
found. */
|
||||
/* Define to 1 if the system has the type `long long'. */
|
||||
#undef HAVE_LONG_LONG
|
||||
|
||||
#define HAVE_STRERROR 0
|
||||
#define HAVE_MEMMOVE 0
|
||||
/* Define to 1 if you have the `memmove' function. */
|
||||
#undef HAVE_MEMMOVE
|
||||
|
||||
/* There are some non-Unix-like systems that don't even have bcopy(). If this
|
||||
macro is false, an emulation is used. If HAVE_MEMMOVE is set to 1, the value of
|
||||
HAVE_BCOPY is not relevant. */
|
||||
/* Define to 1 if you have the <memory.h> header file. */
|
||||
#undef HAVE_MEMORY_H
|
||||
|
||||
#define HAVE_BCOPY 0
|
||||
/* Define to 1 if you have the <readline/history.h> header file. */
|
||||
#undef HAVE_READLINE_HISTORY_H
|
||||
|
||||
/* The value of NEWLINE determines the newline character. The default is to
|
||||
leave it up to the compiler, but some sites want to force a particular value.
|
||||
On Unix-like systems, "configure" can be used to override this default. */
|
||||
/* Define to 1 if you have the <readline/readline.h> header file. */
|
||||
#undef HAVE_READLINE_READLINE_H
|
||||
|
||||
#ifndef NEWLINE
|
||||
#define NEWLINE '\n'
|
||||
#endif
|
||||
/* Define to 1 if you have the <stdint.h> header file. */
|
||||
#undef HAVE_STDINT_H
|
||||
|
||||
/* The value of LINK_SIZE determines the number of bytes used to store links as
|
||||
offsets within the compiled regex. The default is 2, which allows for compiled
|
||||
patterns up to 64K long. This covers the vast majority of cases. However, PCRE
|
||||
can also be compiled to use 3 or 4 bytes instead. This allows for longer
|
||||
patterns in extreme cases. On systems that support it, "configure" can be used
|
||||
to override this default. */
|
||||
/* Define to 1 if you have the <stdlib.h> header file. */
|
||||
#undef HAVE_STDLIB_H
|
||||
|
||||
#ifndef LINK_SIZE
|
||||
#define LINK_SIZE 2
|
||||
#endif
|
||||
/* Define to 1 if you have the `strerror' function. */
|
||||
#undef HAVE_STRERROR
|
||||
|
||||
/* When calling PCRE via the POSIX interface, additional working storage is
|
||||
required for holding the pointers to capturing substrings because PCRE requires
|
||||
three integers per substring, whereas the POSIX interface provides only two. If
|
||||
the number of expected substrings is small, the wrapper function uses space on
|
||||
the stack, because this is faster than using malloc() for each call. The
|
||||
threshold above which the stack is no longer used is defined by POSIX_MALLOC_
|
||||
THRESHOLD. On systems that support it, "configure" can be used to override this
|
||||
default. */
|
||||
/* Define to 1 if you have the <string> header file. */
|
||||
#undef HAVE_STRING
|
||||
|
||||
#ifndef POSIX_MALLOC_THRESHOLD
|
||||
#define POSIX_MALLOC_THRESHOLD 10
|
||||
#endif
|
||||
/* Define to 1 if you have the <strings.h> header file. */
|
||||
#undef HAVE_STRINGS_H
|
||||
|
||||
/* PCRE uses recursive function calls to handle backtracking while matching.
|
||||
This can sometimes be a problem on systems that have stacks of limited size.
|
||||
Define NO_RECURSE to get a version that doesn't use recursion in the match()
|
||||
function; instead it creates its own stack by steam using pcre_recurse_malloc()
|
||||
to obtain memory from the heap. For more detail, see the comments and other
|
||||
stuff just above the match() function. On systems that support it, "configure"
|
||||
can be used to set this in the Makefile (use --disable-stack-for-recursion). */
|
||||
/* Define to 1 if you have the <string.h> header file. */
|
||||
#undef HAVE_STRING_H
|
||||
|
||||
/* #define NO_RECURSE */
|
||||
/* Define to 1 if you have the `strtoll' function. */
|
||||
#undef HAVE_STRTOLL
|
||||
|
||||
/* The value of MATCH_LIMIT determines the default number of times the internal
|
||||
match() function can be called during a single execution of pcre_exec(). There
|
||||
is a runtime interface for setting a different limit. The limit exists in order
|
||||
to catch runaway regular expressions that take for ever to determine that they
|
||||
do not match. The default is set very large so that it does not accidentally
|
||||
catch legitimate cases. On systems that support it, "configure" can be used to
|
||||
override this default. */
|
||||
/* Define to 1 if you have the `strtoq' function. */
|
||||
#undef HAVE_STRTOQ
|
||||
|
||||
#ifndef MATCH_LIMIT
|
||||
#define MATCH_LIMIT 10000000
|
||||
#endif
|
||||
/* Define to 1 if you have the <sys/stat.h> header file. */
|
||||
#undef HAVE_SYS_STAT_H
|
||||
|
||||
/* Define to 1 if you have the <sys/types.h> header file. */
|
||||
#undef HAVE_SYS_TYPES_H
|
||||
|
||||
/* Define to 1 if you have the <type_traits.h> header file. */
|
||||
#undef HAVE_TYPE_TRAITS_H
|
||||
|
||||
/* Define to 1 if you have the <unistd.h> header file. */
|
||||
#undef HAVE_UNISTD_H
|
||||
|
||||
/* Define to 1 if the system has the type `unsigned long long'. */
|
||||
#undef HAVE_UNSIGNED_LONG_LONG
|
||||
|
||||
/* Define to 1 if you have the <windows.h> header file. */
|
||||
#undef HAVE_WINDOWS_H
|
||||
|
||||
/* Define to 1 if you have the <zlib.h> header file. */
|
||||
#undef HAVE_ZLIB_H
|
||||
|
||||
/* Define to 1 if you have the `_strtoi64' function. */
|
||||
#undef HAVE__STRTOI64
|
||||
|
||||
/* The value of LINK_SIZE determines the number of bytes used to store links
|
||||
as offsets within the compiled regex. The default is 2, which allows for
|
||||
compiled patterns up to 64K long. This covers the vast majority of cases.
|
||||
However, PCRE can also be compiled to use 3 or 4 bytes instead. This allows
|
||||
for longer patterns in extreme cases. On systems that support it,
|
||||
"configure" can be used to override this default. */
|
||||
#undef LINK_SIZE
|
||||
|
||||
/* The value of MATCH_LIMIT determines the default number of times the
|
||||
internal match() function can be called during a single execution of
|
||||
pcre_exec(). There is a runtime interface for setting a different limit.
|
||||
The limit exists in order to catch runaway regular expressions that take
|
||||
for ever to determine that they do not match. The default is set very large
|
||||
so that it does not accidentally catch legitimate cases. On systems that
|
||||
support it, "configure" can be used to override this default. */
|
||||
#undef MATCH_LIMIT
|
||||
|
||||
/* The above limit applies to all calls of match(), whether or not they
|
||||
increase the recursion depth. In some environments it is desirable to limit the
|
||||
depth of recursive calls of match() more strictly, in order to restrict the
|
||||
maximum amount of stack (or heap, if NO_RECURSE is defined) that is used. The
|
||||
value of MATCH_LIMIT_RECURSION applies only to recursive calls of match(). To
|
||||
have any useful effect, it must be less than the value of MATCH_LIMIT. There is
|
||||
a runtime method for setting a different limit. On systems that support it,
|
||||
"configure" can be used to override this default. */
|
||||
increase the recursion depth. In some environments it is desirable to limit
|
||||
the depth of recursive calls of match() more strictly, in order to restrict
|
||||
the maximum amount of stack (or heap, if NO_RECURSE is defined) that is
|
||||
used. The value of MATCH_LIMIT_RECURSION applies only to recursive calls of
|
||||
match(). To have any useful effect, it must be less than the value of
|
||||
MATCH_LIMIT. The default is to use the same value as MATCH_LIMIT. There is
|
||||
a runtime method for setting a different limit. On systems that support it,
|
||||
"configure" can be used to override the default. */
|
||||
#undef MATCH_LIMIT_RECURSION
|
||||
|
||||
#ifndef MATCH_LIMIT_RECURSION
|
||||
#define MATCH_LIMIT_RECURSION MATCH_LIMIT
|
||||
#endif
|
||||
/* This limit is parameterized just in case anybody ever wants to change it.
|
||||
Care must be taken if it is increased, because it guards against integer
|
||||
overflow caused by enormously large patterns. */
|
||||
#undef MAX_NAME_COUNT
|
||||
|
||||
/* These three limits are parameterized just in case anybody ever wants to
|
||||
change them. Care must be taken if they are increased, because they guard
|
||||
against integer overflow caused by enormously large patterns. */
|
||||
/* This limit is parameterized just in case anybody ever wants to change it.
|
||||
Care must be taken if it is increased, because it guards against integer
|
||||
overflow caused by enormously large patterns. */
|
||||
#undef MAX_NAME_SIZE
|
||||
|
||||
#ifndef MAX_NAME_SIZE
|
||||
#define MAX_NAME_SIZE 32
|
||||
#endif
|
||||
/* The value of NEWLINE determines the newline character sequence. On systems
|
||||
that support it, "configure" can be used to override the default, which is
|
||||
10. The possible values are 10 (LF), 13 (CR), 3338 (CRLF), -1 (ANY), or -2
|
||||
(ANYCRLF). */
|
||||
#undef NEWLINE
|
||||
|
||||
#ifndef MAX_NAME_COUNT
|
||||
#define MAX_NAME_COUNT 10000
|
||||
#endif
|
||||
/* PCRE uses recursive function calls to handle backtracking while matching.
|
||||
This can sometimes be a problem on systems that have stacks of limited
|
||||
size. Define NO_RECURSE to get a version that doesn't use recursion in the
|
||||
match() function; instead it creates its own stack by steam using
|
||||
pcre_recurse_malloc() to obtain memory from the heap. For more detail, see
|
||||
the comments and other stuff just above the match() function. On systems
|
||||
that support it, "configure" can be used to set this in the Makefile (use
|
||||
--disable-stack-for-recursion). */
|
||||
#undef NO_RECURSE
|
||||
|
||||
#ifndef MAX_DUPLENGTH
|
||||
#define MAX_DUPLENGTH 30000
|
||||
#endif
|
||||
/* Name of package */
|
||||
#undef PACKAGE
|
||||
|
||||
/* End */
|
||||
/* Define to the address where bug reports for this package should be sent. */
|
||||
#undef PACKAGE_BUGREPORT
|
||||
|
||||
/* Define to the full name of this package. */
|
||||
#undef PACKAGE_NAME
|
||||
|
||||
/* Define to the full name and version of this package. */
|
||||
#undef PACKAGE_STRING
|
||||
|
||||
/* Define to the one symbol short name of this package. */
|
||||
#undef PACKAGE_TARNAME
|
||||
|
||||
/* Define to the version of this package. */
|
||||
#undef PACKAGE_VERSION
|
||||
|
||||
|
||||
/* If you are compiling for a system other than a Unix-like system or
|
||||
Win32, and it needs some magic to be inserted before the definition
|
||||
of a function that is exported by the library, define this macro to
|
||||
contain the relevant magic. If you do not define this macro, it
|
||||
defaults to "extern" for a C compiler and "extern C" for a C++
|
||||
compiler on non-Win32 systems. This macro appears at the start of
|
||||
every exported function that is part of the external API. It does
|
||||
not appear on functions that are "external" in the C sense, but
|
||||
which are internal to the library. */
|
||||
#undef PCRE_EXP_DEFN
|
||||
|
||||
/* Define if linking statically (TODO: make nice with Libtool) */
|
||||
#undef PCRE_STATIC
|
||||
|
||||
/* When calling PCRE via the POSIX interface, additional working storage is
|
||||
required for holding the pointers to capturing substrings because PCRE
|
||||
requires three integers per substring, whereas the POSIX interface provides
|
||||
only two. If the number of expected substrings is small, the wrapper
|
||||
function uses space on the stack, because this is faster than using
|
||||
malloc() for each call. The threshold above which the stack is no longer
|
||||
used is defined by POSIX_MALLOC_THRESHOLD. On systems that support it,
|
||||
"configure" can be used to override this default. */
|
||||
#undef POSIX_MALLOC_THRESHOLD
|
||||
|
||||
/* Define to 1 if you have the ANSI C header files. */
|
||||
#undef STDC_HEADERS
|
||||
|
||||
/* Define to allow pcregrep to be linked with libbz2, so that it is able to
|
||||
handle .bz2 files. */
|
||||
#undef SUPPORT_LIBBZ2
|
||||
|
||||
/* Define to allow pcretest to be linked with libreadline. */
|
||||
#undef SUPPORT_LIBREADLINE
|
||||
|
||||
/* Define to allow pcregrep to be linked with libz, so that it is able to
|
||||
handle .gz files. */
|
||||
#undef SUPPORT_LIBZ
|
||||
|
||||
/* Define to enable support for Unicode properties */
|
||||
#undef SUPPORT_UCP
|
||||
|
||||
/* Define to enable support for the UTF-8 Unicode encoding. This will work
|
||||
even in an EBCDIC environment, but it is incompatible with the EBCDIC
|
||||
macro. That is, PCRE can support *either* EBCDIC code *or* ASCII/UTF-8, but
|
||||
not both at once. */
|
||||
#undef SUPPORT_UTF8
|
||||
|
||||
/* Version number of package */
|
||||
#undef VERSION
|
||||
|
||||
/* Define to empty if `const' does not conform to ANSI C. */
|
||||
#undef const
|
||||
|
||||
/* Define to `unsigned int' if <sys/types.h> does not define. */
|
||||
#undef size_t
|
||||
|
||||
+607
-252
@@ -1,91 +1,282 @@
|
||||
|
||||
dnl Process this file with autoconf to produce a configure script.
|
||||
|
||||
dnl This configure.in file has been hacked around quite a lot as a result of
|
||||
dnl patches that various people have sent to me (PH). Sometimes the information
|
||||
dnl I get is contradictory. I've tried to put in comments that explain things,
|
||||
dnl but in some cases the information is second-hand and I have no way of
|
||||
dnl verifying it. I am not an autoconf or libtool expert!
|
||||
dnl NOTE FOR MAINTAINERS: Do not use major or minor version numbers with
|
||||
dnl leading zeros, because they may be treated as octal constants. The
|
||||
dnl PCRE_PRERELEASE feature is for identifying release candidates. It might
|
||||
dnl be defined as -RC2, for example. For real releases, it should be defined
|
||||
dnl empty.
|
||||
|
||||
dnl This is required at the start; the name is the name of a file
|
||||
dnl it should be seeing, to verify it is in the same directory.
|
||||
m4_define(pcre_major, [7])
|
||||
m4_define(pcre_minor, [9])
|
||||
m4_define(pcre_prerelease, [])
|
||||
m4_define(pcre_date, [2009-04-11])
|
||||
|
||||
AC_INIT(dftables.c)
|
||||
AC_CONFIG_SRCDIR([pcre.h])
|
||||
|
||||
dnl A safety precaution
|
||||
# Libtool shared library interface versions (current:revision:age)
|
||||
m4_define(libpcre_version, [0:1:0])
|
||||
m4_define(libpcreposix_version, [0:0:0])
|
||||
m4_define(libpcrecpp_version, [0:0:0])
|
||||
|
||||
AC_PREREQ(2.57)
|
||||
AC_INIT(PCRE, pcre_major.pcre_minor[]pcre_prerelease, , pcre)
|
||||
AC_CONFIG_SRCDIR([pcre.h.in])
|
||||
AM_INIT_AUTOMAKE([dist-bzip2 dist-zip])
|
||||
AC_CONFIG_HEADERS(config.h)
|
||||
|
||||
dnl Arrange to build config.h from config.h.in.
|
||||
dnl Manual says this macro should come right after AC_INIT.
|
||||
AC_CONFIG_HEADER(config.h)
|
||||
# The default CFLAGS and CXXFLAGS in Autoconf are "-g -O2" for gcc and just
|
||||
# "-g" for any other compiler. There doesn't seem to be a standard way of
|
||||
# getting rid of the -g (which I don't think is needed for a production
|
||||
# library). This fudge seems to achieve the necessary. First, we remember the
|
||||
# externally set values of CFLAGS and CXXFLAGS. Then call the AC_PROG_CC and
|
||||
# AC_PROG_CXX macros to find the compilers - if CFLAGS and CXXFLAGS are not
|
||||
# set, they will be set to Autoconf's defaults. Afterwards, if the original
|
||||
# values were not set, remove the -g from the Autoconf defaults.
|
||||
# (PH 02-May-07)
|
||||
|
||||
dnl Default values for miscellaneous macros
|
||||
|
||||
POSIX_MALLOC_THRESHOLD=-DPOSIX_MALLOC_THRESHOLD=10
|
||||
|
||||
dnl Provide versioning information for libtool shared libraries that
|
||||
dnl are built by default on Unix systems.
|
||||
|
||||
PCRE_LIB_VERSION=0:1:0
|
||||
PCRE_POSIXLIB_VERSION=0:0:0
|
||||
PCRE_CPPLIB_VERSION=0:0:0
|
||||
|
||||
dnl Find the PCRE version from the pcre.h file. The PCRE_VERSION variable is
|
||||
dnl substituted in pcre-config.in.
|
||||
|
||||
PCRE_MAJOR=`grep '#define PCRE_MAJOR' ${srcdir}/pcre.h | cut -c 29-`
|
||||
PCRE_MINOR=`grep '#define PCRE_MINOR' ${srcdir}/pcre.h | cut -c 29-`
|
||||
PCRE_PRERELEASE=`grep '#define PCRE_PRERELEASE' ${srcdir}/pcre.h | cut -c 29-`
|
||||
PCRE_VERSION=${PCRE_MAJOR}.${PCRE_MINOR}${PCRE_PRERELEASE}
|
||||
|
||||
dnl Handle --disable-cpp
|
||||
|
||||
AC_ARG_ENABLE(cpp,
|
||||
[ --disable-cpp disable C++ support],
|
||||
want_cpp="$enableval", want_cpp=yes)
|
||||
|
||||
dnl Checks for programs.
|
||||
remember_set_CFLAGS="$CFLAGS"
|
||||
remember_set_CXXFLAGS="$CXXFLAGS"
|
||||
|
||||
AC_PROG_CC
|
||||
|
||||
dnl Test for C++ for the C++ wrapper libpcrecpp. It seems, however, that
|
||||
dnl AC_PROG_CXX will set $CXX to "g++" when no C++ compiler is installed, even
|
||||
dnl though that is completely bogus. (This may happen only on certain systems
|
||||
dnl with certain versions of autoconf, of course.) An attempt to include this
|
||||
dnl test inside a check for want_cpp was criticized by a libtool expert, who
|
||||
dnl tells me that it isn't allowed.
|
||||
|
||||
AC_PROG_CXX
|
||||
|
||||
dnl The icc compiler has the same options as gcc, so let the rest of the
|
||||
dnl configure script think it has gcc when setting up options etc.
|
||||
dnl This is a nasty hack which no longer seems necessary with the update
|
||||
dnl to the latest libtool files, so I have commented it out.
|
||||
dnl
|
||||
dnl if test "$CC" = "icc" ; then GCC=yes ; fi
|
||||
if test "x$remember_set_CFLAGS" = "x"
|
||||
then
|
||||
if test "$CFLAGS" = "-g -O2"
|
||||
then
|
||||
CFLAGS="-O2"
|
||||
elif test "$CFLAGS" = "-g"
|
||||
then
|
||||
CFLAGS=""
|
||||
fi
|
||||
fi
|
||||
|
||||
if test "x$remember_set_CXXFLAGS" = "x"
|
||||
then
|
||||
if test "$CXXFLAGS" = "-g -O2"
|
||||
then
|
||||
CXXFLAGS="-O2"
|
||||
elif test "$CXXFLAGS" = "-g"
|
||||
then
|
||||
CXXFLAGS=""
|
||||
fi
|
||||
fi
|
||||
|
||||
# AC_PROG_CXX will return "g++" even if no c++ compiler is installed.
|
||||
# Check for that case, and just disable c++ code if g++ doesn't run.
|
||||
AC_LANG_PUSH(C++)
|
||||
AC_COMPILE_IFELSE(AC_LANG_PROGRAM([],[]),, CXX=""; CXXCP=""; CXXFLAGS="")
|
||||
AC_LANG_POP
|
||||
|
||||
AC_PROG_INSTALL
|
||||
AC_LIBTOOL_WIN32_DLL
|
||||
AC_PROG_LIBTOOL
|
||||
AC_PROG_LN_S
|
||||
|
||||
dnl We need to find a compiler for compiling a program to run on the local host
|
||||
dnl while building. It needs to be different from CC when cross-compiling.
|
||||
dnl There is a macro called AC_PROG_CC_FOR_BUILD in the GNU archive for
|
||||
dnl figuring this out automatically. Unfortunately, it does not work with the
|
||||
dnl latest versions of autoconf. So for the moment, we just default to the
|
||||
dnl same values as the "main" compiler. People who are cross-compiling will
|
||||
dnl just have to adjust the Makefile by hand or set these values when they
|
||||
dnl run "configure".
|
||||
PCRE_MAJOR="pcre_major"
|
||||
PCRE_MINOR="pcre_minor"
|
||||
PCRE_PRERELEASE="pcre_prerelease"
|
||||
PCRE_DATE="pcre_date"
|
||||
|
||||
CC_FOR_BUILD=${CC_FOR_BUILD:-'$(CC)'}
|
||||
CXX_FOR_BUILD=${CXX_FOR_BUILD:-'$(CXX)'}
|
||||
CFLAGS_FOR_BUILD=${CFLAGS_FOR_BUILD:-'$(CFLAGS)'}
|
||||
CPPFLAGS_FOR_BUILD=${CFLAGS_FOR_BUILD:-'$(CPPFLAGS)'}
|
||||
CXXFLAGS_FOR_BUILD=${CXXFLAGS_FOR_BUILD:-'$(CXXFLAGS)'}
|
||||
BUILD_EXEEXT=${BUILD_EXEEXT:-'$(EXEEXT)'}
|
||||
BUILD_OBJEXT=${BUILD_OBJEXT:-'$(OBJEXT)'}
|
||||
AC_SUBST(PCRE_MAJOR)
|
||||
AC_SUBST(PCRE_MINOR)
|
||||
AC_SUBST(PCRE_PRERELEASE)
|
||||
AC_SUBST(PCRE_DATE)
|
||||
|
||||
# Set a more sensible default value for $(htmldir).
|
||||
if test "x$htmldir" = 'x${docdir}'
|
||||
then
|
||||
htmldir='${docdir}/html'
|
||||
fi
|
||||
|
||||
# Handle --disable-cpp
|
||||
AC_ARG_ENABLE(cpp,
|
||||
AS_HELP_STRING([--disable-cpp],
|
||||
[disable C++ support]),
|
||||
, enable_cpp=yes)
|
||||
|
||||
# Handle --enable-rebuild-chartables
|
||||
AC_ARG_ENABLE(rebuild-chartables,
|
||||
AS_HELP_STRING([--enable-rebuild-chartables],
|
||||
[rebuild character tables in current locale]),
|
||||
, enable_rebuild_chartables=no)
|
||||
|
||||
# Handle --enable-utf8 (disabled by default)
|
||||
AC_ARG_ENABLE(utf8,
|
||||
AS_HELP_STRING([--enable-utf8],
|
||||
[enable UTF-8 support (incompatible with --enable-ebcdic)]),
|
||||
, enable_utf8=unset)
|
||||
|
||||
# Handle --enable-unicode-properties
|
||||
AC_ARG_ENABLE(unicode-properties,
|
||||
AS_HELP_STRING([--enable-unicode-properties],
|
||||
[enable Unicode properties support (implies --enable-utf8)]),
|
||||
, enable_unicode_properties=no)
|
||||
|
||||
# Handle --enable-newline=NL
|
||||
dnl AC_ARG_ENABLE(newline,
|
||||
dnl AS_HELP_STRING([--enable-newline=NL],
|
||||
dnl [use NL as newline (lf, cr, crlf, anycrlf, any; default=lf)]),
|
||||
dnl , enable_newline=lf)
|
||||
|
||||
# Separate newline options
|
||||
ac_pcre_newline=lf
|
||||
AC_ARG_ENABLE(newline-is-cr,
|
||||
AS_HELP_STRING([--enable-newline-is-cr],
|
||||
[use CR as newline character]),
|
||||
ac_pcre_newline=cr)
|
||||
AC_ARG_ENABLE(newline-is-lf,
|
||||
AS_HELP_STRING([--enable-newline-is-lf],
|
||||
[use LF as newline character (default)]),
|
||||
ac_pcre_newline=lf)
|
||||
AC_ARG_ENABLE(newline-is-crlf,
|
||||
AS_HELP_STRING([--enable-newline-is-crlf],
|
||||
[use CRLF as newline sequence]),
|
||||
ac_pcre_newline=crlf)
|
||||
AC_ARG_ENABLE(newline-is-anycrlf,
|
||||
AS_HELP_STRING([--enable-newline-is-anycrlf],
|
||||
[use CR, LF, or CRLF as newline sequence]),
|
||||
ac_pcre_newline=anycrlf)
|
||||
AC_ARG_ENABLE(newline-is-any,
|
||||
AS_HELP_STRING([--enable-newline-is-any],
|
||||
[use any valid Unicode newline sequence]),
|
||||
ac_pcre_newline=any)
|
||||
enable_newline="$ac_pcre_newline"
|
||||
|
||||
# Handle --enable-bsr-anycrlf
|
||||
AC_ARG_ENABLE(bsr-anycrlf,
|
||||
AS_HELP_STRING([--enable-bsr-anycrlf],
|
||||
[\R matches only CR, LF, CRLF by default]),
|
||||
, enable_bsr_anycrlf=no)
|
||||
|
||||
# Handle --enable-ebcdic
|
||||
AC_ARG_ENABLE(ebcdic,
|
||||
AS_HELP_STRING([--enable-ebcdic],
|
||||
[assume EBCDIC coding rather than ASCII; incompatible with --enable-utf8; use only in (uncommon) EBCDIC environments; it implies --enable-rebuild-chartables]),
|
||||
, enable_ebcdic=no)
|
||||
|
||||
# Handle --disable-stack-for-recursion
|
||||
AC_ARG_ENABLE(stack-for-recursion,
|
||||
AS_HELP_STRING([--disable-stack-for-recursion],
|
||||
[don't use stack recursion when matching]),
|
||||
, enable_stack_for_recursion=yes)
|
||||
|
||||
# Handle --enable-pcregrep-libz
|
||||
AC_ARG_ENABLE(pcregrep-libz,
|
||||
AS_HELP_STRING([--enable-pcregrep-libz],
|
||||
[link pcregrep with libz to handle .gz files]),
|
||||
, enable_pcregrep_libz=no)
|
||||
|
||||
# Handle --enable-pcregrep-libbz2
|
||||
AC_ARG_ENABLE(pcregrep-libbz2,
|
||||
AS_HELP_STRING([--enable-pcregrep-libbz2],
|
||||
[link pcregrep with libbz2 to handle .bz2 files]),
|
||||
, enable_pcregrep_libbz2=no)
|
||||
|
||||
# Handle --enable-pcretest-libreadline
|
||||
AC_ARG_ENABLE(pcretest-libreadline,
|
||||
AS_HELP_STRING([--enable-pcretest-libreadline],
|
||||
[link pcretest with libreadline]),
|
||||
, enable_pcretest_libreadline=no)
|
||||
|
||||
# Handle --with-posix-malloc-threshold=NBYTES
|
||||
AC_ARG_WITH(posix-malloc-threshold,
|
||||
AS_HELP_STRING([--with-posix-malloc-threshold=NBYTES],
|
||||
[threshold for POSIX malloc usage (default=10)]),
|
||||
, with_posix_malloc_threshold=10)
|
||||
|
||||
# Handle --with-link-size=N
|
||||
AC_ARG_WITH(link-size,
|
||||
AS_HELP_STRING([--with-link-size=N],
|
||||
[internal link size (2, 3, or 4 allowed; default=2)]),
|
||||
, with_link_size=2)
|
||||
|
||||
# Handle --with-match-limit=N
|
||||
AC_ARG_WITH(match-limit,
|
||||
AS_HELP_STRING([--with-match-limit=N],
|
||||
[default limit on internal looping (default=10000000)]),
|
||||
, with_match_limit=10000000)
|
||||
|
||||
# Handle --with-match-limit-recursion=N
|
||||
#
|
||||
# Note: In config.h, the default is to define MATCH_LIMIT_RECURSION
|
||||
# symbolically as MATCH_LIMIT, which in turn is defined to be some numeric
|
||||
# value (e.g. 10000000). MATCH_LIMIT_RECURSION can otherwise be set to some
|
||||
# different numeric value (or even the same numeric value as MATCH_LIMIT,
|
||||
# though no longer defined in terms of the latter).
|
||||
#
|
||||
AC_ARG_WITH(match-limit-recursion,
|
||||
AS_HELP_STRING([--with-match-limit-recursion=N],
|
||||
[default limit on internal recursion (default=MATCH_LIMIT)]),
|
||||
, with_match_limit_recursion=MATCH_LIMIT)
|
||||
|
||||
# Make sure that if enable_unicode_properties was set, that UTF-8 support
|
||||
# is enabled.
|
||||
#
|
||||
if test "x$enable_unicode_properties" = "xyes"
|
||||
then
|
||||
if test "x$enable_utf8" = "xno"
|
||||
then
|
||||
AC_MSG_ERROR([support for Unicode properties requires UTF-8 support])
|
||||
fi
|
||||
enable_utf8=yes
|
||||
fi
|
||||
|
||||
if test "x$enable_utf8" = "xunset"
|
||||
then
|
||||
enable_utf8=no
|
||||
fi
|
||||
|
||||
# Make sure that if enable_ebcdic is set, rebuild_chartables is also enabled.
|
||||
# Also check that UTF-8 support is not requested, because PCRE cannot handle
|
||||
# EBCDIC and UTF-8 in the same build. To do so it would need to use different
|
||||
# character constants depending on the mode.
|
||||
#
|
||||
if test "x$enable_ebcdic" = "xyes"
|
||||
then
|
||||
enable_rebuild_chartables=yes
|
||||
if test "x$enable_utf8" = "xyes"
|
||||
then
|
||||
AC_MSG_ERROR([support for EBCDIC and UTF-8 cannot be enabled at the same time])
|
||||
fi
|
||||
fi
|
||||
|
||||
# Convert the newline identifier into the appropriate integer value.
|
||||
case "$enable_newline" in
|
||||
lf) ac_pcre_newline_value=10 ;;
|
||||
cr) ac_pcre_newline_value=13 ;;
|
||||
crlf) ac_pcre_newline_value=3338 ;;
|
||||
anycrlf) ac_pcre_newline_value=-2 ;;
|
||||
any) ac_pcre_newline_value=-1 ;;
|
||||
*)
|
||||
AC_MSG_ERROR([invalid argument \"$enable_newline\" to --enable-newline option])
|
||||
;;
|
||||
esac
|
||||
|
||||
# Check argument to --with-link-size
|
||||
case "$with_link_size" in
|
||||
2|3|4) ;;
|
||||
*)
|
||||
AC_MSG_ERROR([invalid argument \"$with_link_size\" to --with-link-size option])
|
||||
;;
|
||||
esac
|
||||
|
||||
AH_TOP([
|
||||
/* On Unix-like systems config.h.in is converted by "configure" into config.h.
|
||||
Some other environments also support the use of "configure". PCRE is written in
|
||||
Standard C, but there are a few non-standard things it can cope with, allowing
|
||||
it to run on SunOS4 and other "close to standard" systems.
|
||||
|
||||
If you are going to build PCRE "by hand" on a system without "configure" you
|
||||
should copy the distributed config.h.generic to config.h, and then set up the
|
||||
macro definitions the way you need them. You must then add -DHAVE_CONFIG_H to
|
||||
all of your compile commands, so that config.h is included at the start of
|
||||
every source.
|
||||
|
||||
Alternatively, you can avoid editing by using -D on the compiler command line
|
||||
to set the macro values. In this case, you do not have to set -DHAVE_CONFIG_H.
|
||||
|
||||
PCRE uses memmove() if HAVE_MEMMOVE is set to 1; otherwise it uses bcopy() if
|
||||
HAVE_BCOPY is set to 1. If your system has neither bcopy() nor memmove(), set
|
||||
them both to 0; an emulation function will be used. */])
|
||||
|
||||
AC_DEFUN([AX_COMPILER_VENDOR],
|
||||
[
|
||||
@@ -116,217 +307,381 @@ if test "x${ax_cv_c_compiler_vendor}" = "xsun" ; then
|
||||
fi
|
||||
fi
|
||||
|
||||
dnl Checks for header files.
|
||||
|
||||
# Checks for header files.
|
||||
AC_HEADER_STDC
|
||||
AC_CHECK_HEADERS(limits.h)
|
||||
AC_CHECK_HEADERS(limits.h sys/types.h sys/stat.h dirent.h windows.h)
|
||||
|
||||
dnl The files below are C++ header files. One person told me (PH) that
|
||||
dnl AC_LANG_CPLUSPLUS unsets CXX if it was explicitly set to something which
|
||||
dnl doesn't work. However, this doesn't always seem to be the case.
|
||||
|
||||
if test "x$want_cpp" = "xyes" -a -n "$CXX"
|
||||
# The files below are C++ header files.
|
||||
pcre_have_type_traits="0"
|
||||
pcre_have_bits_type_traits="0"
|
||||
if test "x$enable_cpp" = "xyes" -a -n "$CXX"
|
||||
then
|
||||
AC_LANG_SAVE
|
||||
AC_LANG_CPLUSPLUS
|
||||
AC_LANG_PUSH(C++)
|
||||
|
||||
dnl We could be more clever here, given we're doing AC_SUBST with this
|
||||
dnl (eg set a var to be the name of the include file we want). But we're not
|
||||
dnl so it's easy to change back to 'regular' autoconf vars if we needed to.
|
||||
# Older versions of pcre defined pcrecpp::no_arg, but in new versions
|
||||
# it's called pcrecpp::RE::no_arg. For backwards ABI compatibility,
|
||||
# we want to make one an alias for the other. Different systems do
|
||||
# this in different ways. Some systems, for instance, can do it via
|
||||
# a linker flag: -alias (for os x 10.5) or -i (for os x <=10.4).
|
||||
OLD_LDFLAGS="$LDFLAGS"
|
||||
for flag in "-alias,__ZN7pcrecpp2RE6no_argE,__ZN7pcrecpp6no_argE" \
|
||||
"-i__ZN7pcrecpp6no_argE:__ZN7pcrecpp2RE6no_argE"; do
|
||||
AC_MSG_CHECKING([for alias support in the linker])
|
||||
LDFLAGS="$OLD_LDFLAGS -Wl,$flag"
|
||||
# We try to run the linker with this new ld flag. If the link fails,
|
||||
# we give up and remove the new flag from LDFLAGS.
|
||||
AC_LINK_IFELSE(AC_LANG_PROGRAM([namespace pcrecpp {
|
||||
class RE { static int no_arg; };
|
||||
int RE::no_arg;
|
||||
}],
|
||||
[]),
|
||||
[AC_MSG_RESULT([yes]);
|
||||
EXTRA_LIBPCRECPP_LDFLAGS="$EXTRA_LIBPCRECPP_LDFLAGS -Wl,$flag";
|
||||
break;],
|
||||
AC_MSG_RESULT([no]))
|
||||
done
|
||||
LDFLAGS="$OLD_LDFLAGS"
|
||||
|
||||
# We could be more clever here, given we're doing AC_SUBST with this
|
||||
# (eg set a var to be the name of the include file we want). But we're not
|
||||
# so it's easy to change back to 'regular' autoconf vars if we needed to.
|
||||
AC_CHECK_HEADERS(string, [pcre_have_cpp_headers="1"],
|
||||
[pcre_have_cpp_headers="0"])
|
||||
AC_CHECK_HEADERS(bits/type_traits.h, [pcre_have_bits_type_traits="1"],
|
||||
[pcre_have_bits_type_traits="0"])
|
||||
AC_CHECK_HEADERS(type_traits.h, [pcre_have_type_traits="1"],
|
||||
[pcre_have_type_traits="0"])
|
||||
dnl Using AC_SUBST eliminates the need to include config.h in a public .h file
|
||||
AC_SUBST(pcre_have_bits_type_traits)
|
||||
|
||||
AC_LANG_POP
|
||||
fi
|
||||
# Using AC_SUBST eliminates the need to include config.h in a public .h file
|
||||
AC_SUBST(pcre_have_type_traits)
|
||||
AC_LANG_RESTORE
|
||||
fi
|
||||
AC_SUBST(pcre_have_bits_type_traits)
|
||||
|
||||
dnl From the above, we now have enough info to know if C++ is fully installed
|
||||
if test "x$want_cpp" = "xyes" -a -n "$CXX" -a "$pcre_have_cpp_headers" = 1; then
|
||||
MAYBE_CPP_TARGETS='$(CPP_TARGETS)'
|
||||
HAVE_CPP=
|
||||
else
|
||||
MAYBE_CPP_TARGETS=
|
||||
HAVE_CPP="#"
|
||||
fi
|
||||
AC_SUBST(MAYBE_CPP_TARGETS)
|
||||
AC_SUBST(HAVE_CPP)
|
||||
# Conditional compilation
|
||||
AM_CONDITIONAL(WITH_PCRE_CPP, test "x$enable_cpp" = "xyes")
|
||||
AM_CONDITIONAL(WITH_REBUILD_CHARTABLES, test "x$enable_rebuild_chartables" = "xyes")
|
||||
|
||||
dnl Checks for typedefs, structures, and compiler characteristics.
|
||||
# Checks for typedefs, structures, and compiler characteristics.
|
||||
|
||||
AC_C_CONST
|
||||
AC_TYPE_SIZE_T
|
||||
|
||||
AC_CHECK_TYPES([long long], [pcre_have_long_long="1"], [pcre_have_long_long="0"])
|
||||
AC_CHECK_TYPES([unsigned long long], [pcre_have_ulong_long="1"], [pcre_have_ulong_long="0"])
|
||||
pcre_have_strotolonglong=0
|
||||
AC_CHECK_FUNCS(strtoq strtoll _strtoi64, [pcre_have_strotolonglong="1"; break])
|
||||
# If we can't convert a string to a long long, pretend we don't even
|
||||
# have a long long.
|
||||
if test $pcre_have_strotolonglong = "0"; then
|
||||
pcre_have_long_long="0"
|
||||
pcre_have_ulong_long="0"
|
||||
else
|
||||
AC_CHECK_TYPES([long long],
|
||||
[pcre_have_long_long="1"],
|
||||
[pcre_have_long_long="0"])
|
||||
AC_CHECK_TYPES([unsigned long long],
|
||||
[pcre_have_ulong_long="1"],
|
||||
[pcre_have_ulong_long="0"])
|
||||
fi
|
||||
AC_SUBST(pcre_have_long_long)
|
||||
AC_SUBST(pcre_have_ulong_long)
|
||||
|
||||
dnl Checks for library functions.
|
||||
# Checks for library functions.
|
||||
|
||||
AC_CHECK_FUNCS(bcopy memmove strerror strtoq strtoll)
|
||||
AC_CHECK_FUNCS(bcopy memmove strerror)
|
||||
|
||||
dnl Handle --enable-utf8
|
||||
# Check for the availability of libz (aka zlib)
|
||||
|
||||
AC_ARG_ENABLE(utf8,
|
||||
[ --enable-utf8 enable UTF8 support],
|
||||
if test "$enableval" = "yes"; then
|
||||
UTF8=-DSUPPORT_UTF8
|
||||
fi
|
||||
)
|
||||
AC_CHECK_HEADERS([zlib.h], [HAVE_ZLIB_H=1])
|
||||
AC_CHECK_LIB([z], [gzopen], [HAVE_LIBZ=1])
|
||||
|
||||
dnl Handle --enable-unicode-properties
|
||||
# Check for the availability of libbz2
|
||||
|
||||
AC_ARG_ENABLE(unicode-properties,
|
||||
[ --enable-unicode-properties enable Unicode properties support],
|
||||
if test "$enableval" = "yes"; then
|
||||
UCP=-DSUPPORT_UCP
|
||||
fi
|
||||
)
|
||||
AC_CHECK_HEADERS([bzlib.h], [HAVE_BZLIB_H=1])
|
||||
AC_CHECK_LIB([bz2], [BZ2_bzopen], [HAVE_LIBBZ2=1])
|
||||
|
||||
dnl Handle --enable-newline-is-cr
|
||||
# Check for the availability of libreadline
|
||||
|
||||
AC_ARG_ENABLE(newline-is-cr,
|
||||
[ --enable-newline-is-cr use CR as the newline character],
|
||||
if test "$enableval" = "yes"; then
|
||||
NEWLINE=-DNEWLINE=13
|
||||
fi
|
||||
)
|
||||
AC_CHECK_HEADERS([readline/readline.h], [HAVE_READLINE_H=1])
|
||||
AC_CHECK_HEADERS([readline/history.h], [HAVE_HISTORY_H=1])
|
||||
AC_CHECK_LIB([readline], [readline], [HAVE_LIB_READLINE=1])
|
||||
|
||||
dnl Handle --enable-newline-is-lf
|
||||
|
||||
AC_ARG_ENABLE(newline-is-lf,
|
||||
[ --enable-newline-is-lf use LF as the newline character],
|
||||
if test "$enableval" = "yes"; then
|
||||
NEWLINE=-DNEWLINE=10
|
||||
fi
|
||||
)
|
||||
|
||||
dnl Handle --enable-newline-is-crlf
|
||||
|
||||
AC_ARG_ENABLE(newline-is-crlf,
|
||||
[ --enable-newline-is-crlf use CRLF as the newline sequence],
|
||||
if test "$enableval" = "yes"; then
|
||||
NEWLINE=-DNEWLINE=3338
|
||||
fi
|
||||
)
|
||||
|
||||
dnl Handle --enable-ebcdic
|
||||
|
||||
AC_ARG_ENABLE(ebcdic,
|
||||
[ --enable-ebcdic assume EBCDIC coding rather than ASCII],
|
||||
if test "$enableval" == "yes"; then
|
||||
EBCDIC=-DEBCDIC=1
|
||||
fi
|
||||
)
|
||||
|
||||
dnl Handle --disable-stack-for-recursion
|
||||
|
||||
AC_ARG_ENABLE(stack-for-recursion,
|
||||
[ --disable-stack-for-recursion disable use of stack recursion when matching],
|
||||
if test "$enableval" = "no"; then
|
||||
NO_RECURSE=-DNO_RECURSE
|
||||
fi
|
||||
)
|
||||
|
||||
dnl There doesn't seem to be a straightforward way of having parameters
|
||||
dnl that set values, other than fudging the --with thing. So that's what
|
||||
dnl I've done.
|
||||
|
||||
dnl Handle --with-posix-malloc-threshold=n
|
||||
|
||||
AC_ARG_WITH(posix-malloc-threshold,
|
||||
[ --with-posix-malloc-threshold=10 threshold for POSIX malloc usage],
|
||||
POSIX_MALLOC_THRESHOLD=-DPOSIX_MALLOC_THRESHOLD=$withval
|
||||
)
|
||||
|
||||
dnl Handle --with-link-size=n
|
||||
|
||||
AC_ARG_WITH(link-size,
|
||||
[ --with-link-size=2 internal link size (2, 3, or 4 allowed)],
|
||||
LINK_SIZE=-DLINK_SIZE=$withval
|
||||
)
|
||||
|
||||
dnl Handle --with-match-limit=n
|
||||
|
||||
AC_ARG_WITH(match-limit,
|
||||
[ --with-match-limit=10000000 default limit on internal looping],
|
||||
MATCH_LIMIT=-DMATCH_LIMIT=$withval
|
||||
)
|
||||
|
||||
dnl Handle --with-match-limit_recursion=n
|
||||
|
||||
AC_ARG_WITH(match-limit-recursion,
|
||||
[ --with-match-limit-recursion=10000000 default limit on internal recursion],
|
||||
MATCH_LIMIT_RECURSION=-DMATCH_LIMIT_RECURSION=$withval
|
||||
)
|
||||
|
||||
dnl Unicode character property support implies UTF-8 support
|
||||
|
||||
if test "$UCP" != "" ; then
|
||||
UTF8=-DSUPPORT_UTF8
|
||||
fi
|
||||
|
||||
dnl "Export" these variables
|
||||
|
||||
AC_SUBST(BUILD_EXEEXT)
|
||||
AC_SUBST(BUILD_OBJEXT)
|
||||
AC_SUBST(CC_FOR_BUILD)
|
||||
AC_SUBST(CXX_FOR_BUILD)
|
||||
AC_SUBST(CFLAGS_FOR_BUILD)
|
||||
AC_SUBST(CXXFLAGS_FOR_BUILD)
|
||||
AC_SUBST(CXXLDFLAGS)
|
||||
AC_SUBST(EBCDIC)
|
||||
AC_SUBST(HAVE_MEMMOVE)
|
||||
AC_SUBST(HAVE_STRERROR)
|
||||
AC_SUBST(LINK_SIZE)
|
||||
AC_SUBST(MATCH_LIMIT)
|
||||
AC_SUBST(MATCH_LIMIT_RECURSION)
|
||||
AC_SUBST(NEWLINE)
|
||||
AC_SUBST(NO_RECURSE)
|
||||
AC_SUBST(PCRE_LIB_VERSION)
|
||||
AC_SUBST(PCRE_POSIXLIB_VERSION)
|
||||
AC_SUBST(PCRE_CPPLIB_VERSION)
|
||||
AC_SUBST(PCRE_VERSION)
|
||||
AC_SUBST(POSIX_MALLOC_THRESHOLD)
|
||||
AC_SUBST(UCP)
|
||||
AC_SUBST(UTF8)
|
||||
|
||||
dnl Stuff to make MinGW work better. Special treatment is no longer
|
||||
dnl needed for Cygwin.
|
||||
|
||||
case $host_os in
|
||||
mingw* )
|
||||
POSIX_OBJ=pcreposix.o
|
||||
POSIX_LOBJ=pcreposix.lo
|
||||
POSIX_LIB=
|
||||
ON_WINDOWS=
|
||||
NOT_ON_WINDOWS="#"
|
||||
WIN_PREFIX=
|
||||
;;
|
||||
* )
|
||||
ON_WINDOWS="#"
|
||||
NOT_ON_WINDOWS=
|
||||
POSIX_OBJ=
|
||||
POSIX_LOBJ=
|
||||
POSIX_LIB=libpcreposix.la
|
||||
WIN_PREFIX=
|
||||
;;
|
||||
esac
|
||||
AC_SUBST(WIN_PREFIX)
|
||||
AC_SUBST(ON_WINDOWS)
|
||||
AC_SUBST(NOT_ON_WINDOWS)
|
||||
AC_SUBST(POSIX_OBJ)
|
||||
AC_SUBST(POSIX_LOBJ)
|
||||
AC_SUBST(POSIX_LIB)
|
||||
# This facilitates -ansi builds under Linux
|
||||
dnl AC_DEFINE([_GNU_SOURCE], [], [Enable GNU extensions in glibc])
|
||||
|
||||
if test "x$enable_shared" = "xno" ; then
|
||||
AC_DEFINE([PCRE_STATIC],[1],[to link statically])
|
||||
AC_DEFINE([PCRE_STATIC], [1], [
|
||||
Define if linking statically (TODO: make nice with Libtool)])
|
||||
fi
|
||||
|
||||
dnl This must be last; it determines what files are written as well as config.h
|
||||
AC_OUTPUT(Makefile pcre-config:pcre-config.in libpcre.pc:libpcre.pc.in pcrecpparg.h:pcrecpparg.h.in pcre_stringpiece.h:pcre_stringpiece.h.in RunGrepTest:RunGrepTest.in RunTest:RunTest.in,[chmod a+x RunTest RunGrepTest pcre-config])
|
||||
# Here is where pcre specific defines are handled
|
||||
|
||||
if test "$enable_utf8" = "yes"; then
|
||||
AC_DEFINE([SUPPORT_UTF8], [], [
|
||||
Define to enable support for the UTF-8 Unicode encoding. This will
|
||||
work even in an EBCDIC environment, but it is incompatible with
|
||||
the EBCDIC macro. That is, PCRE can support *either* EBCDIC code
|
||||
*or* ASCII/UTF-8, but not both at once.])
|
||||
fi
|
||||
|
||||
if test "$enable_unicode_properties" = "yes"; then
|
||||
AC_DEFINE([SUPPORT_UCP], [], [
|
||||
Define to enable support for Unicode properties])
|
||||
fi
|
||||
|
||||
if test "$enable_stack_for_recursion" = "no"; then
|
||||
AC_DEFINE([NO_RECURSE], [], [
|
||||
PCRE uses recursive function calls to handle backtracking while
|
||||
matching. This can sometimes be a problem on systems that have
|
||||
stacks of limited size. Define NO_RECURSE to get a version that
|
||||
doesn't use recursion in the match() function; instead it creates
|
||||
its own stack by steam using pcre_recurse_malloc() to obtain memory
|
||||
from the heap. For more detail, see the comments and other stuff
|
||||
just above the match() function. On systems that support it,
|
||||
"configure" can be used to set this in the Makefile
|
||||
(use --disable-stack-for-recursion).])
|
||||
fi
|
||||
|
||||
if test "$enable_pcregrep_libz" = "yes"; then
|
||||
AC_DEFINE([SUPPORT_LIBZ], [], [
|
||||
Define to allow pcregrep to be linked with libz, so that it is
|
||||
able to handle .gz files.])
|
||||
fi
|
||||
|
||||
if test "$enable_pcregrep_libbz2" = "yes"; then
|
||||
AC_DEFINE([SUPPORT_LIBBZ2], [], [
|
||||
Define to allow pcregrep to be linked with libbz2, so that it is
|
||||
able to handle .bz2 files.])
|
||||
fi
|
||||
|
||||
if test "$enable_pcretest_libreadline" = "yes"; then
|
||||
AC_DEFINE([SUPPORT_LIBREADLINE], [], [
|
||||
Define to allow pcretest to be linked with libreadline.])
|
||||
fi
|
||||
|
||||
AC_DEFINE_UNQUOTED([NEWLINE], [$ac_pcre_newline_value], [
|
||||
The value of NEWLINE determines the newline character sequence. On
|
||||
systems that support it, "configure" can be used to override the
|
||||
default, which is 10. The possible values are 10 (LF), 13 (CR),
|
||||
3338 (CRLF), -1 (ANY), or -2 (ANYCRLF).])
|
||||
|
||||
if test "$enable_bsr_anycrlf" = "yes"; then
|
||||
AC_DEFINE([BSR_ANYCRLF], [], [
|
||||
By default, the \R escape sequence matches any Unicode line ending
|
||||
character or sequence of characters. If BSR_ANYCRLF is defined, this is
|
||||
changed so that backslash-R matches only CR, LF, or CRLF. The build-
|
||||
time default can be overridden by the user of PCRE at runtime. On
|
||||
systems that support it, "configure" can be used to override the
|
||||
default.])
|
||||
fi
|
||||
|
||||
AC_DEFINE_UNQUOTED([LINK_SIZE], [$with_link_size], [
|
||||
The value of LINK_SIZE determines the number of bytes used to store
|
||||
links as offsets within the compiled regex. The default is 2, which
|
||||
allows for compiled patterns up to 64K long. This covers the vast
|
||||
majority of cases. However, PCRE can also be compiled to use 3 or 4
|
||||
bytes instead. This allows for longer patterns in extreme cases. On
|
||||
systems that support it, "configure" can be used to override this default.])
|
||||
|
||||
AC_DEFINE_UNQUOTED([POSIX_MALLOC_THRESHOLD], [$with_posix_malloc_threshold], [
|
||||
When calling PCRE via the POSIX interface, additional working storage
|
||||
is required for holding the pointers to capturing substrings because
|
||||
PCRE requires three integers per substring, whereas the POSIX
|
||||
interface provides only two. If the number of expected substrings is
|
||||
small, the wrapper function uses space on the stack, because this is
|
||||
faster than using malloc() for each call. The threshold above which
|
||||
the stack is no longer used is defined by POSIX_MALLOC_THRESHOLD. On
|
||||
systems that support it, "configure" can be used to override this
|
||||
default.])
|
||||
|
||||
AC_DEFINE_UNQUOTED([MATCH_LIMIT], [$with_match_limit], [
|
||||
The value of MATCH_LIMIT determines the default number of times the
|
||||
internal match() function can be called during a single execution of
|
||||
pcre_exec(). There is a runtime interface for setting a different
|
||||
limit. The limit exists in order to catch runaway regular
|
||||
expressions that take for ever to determine that they do not match.
|
||||
The default is set very large so that it does not accidentally catch
|
||||
legitimate cases. On systems that support it, "configure" can be
|
||||
used to override this default.])
|
||||
|
||||
AC_DEFINE_UNQUOTED([MATCH_LIMIT_RECURSION], [$with_match_limit_recursion], [
|
||||
The above limit applies to all calls of match(), whether or not they
|
||||
increase the recursion depth. In some environments it is desirable
|
||||
to limit the depth of recursive calls of match() more strictly, in
|
||||
order to restrict the maximum amount of stack (or heap, if
|
||||
NO_RECURSE is defined) that is used. The value of
|
||||
MATCH_LIMIT_RECURSION applies only to recursive calls of match(). To
|
||||
have any useful effect, it must be less than the value of
|
||||
MATCH_LIMIT. The default is to use the same value as MATCH_LIMIT.
|
||||
There is a runtime method for setting a different limit. On systems
|
||||
that support it, "configure" can be used to override the default.])
|
||||
|
||||
AC_DEFINE([MAX_NAME_SIZE], [32], [
|
||||
This limit is parameterized just in case anybody ever wants to
|
||||
change it. Care must be taken if it is increased, because it guards
|
||||
against integer overflow caused by enormously large patterns.])
|
||||
|
||||
AC_DEFINE([MAX_NAME_COUNT], [10000], [
|
||||
This limit is parameterized just in case anybody ever wants to
|
||||
change it. Care must be taken if it is increased, because it guards
|
||||
against integer overflow caused by enormously large patterns.])
|
||||
|
||||
AH_VERBATIM([PCRE_EXP_DEFN], [
|
||||
/* If you are compiling for a system other than a Unix-like system or
|
||||
Win32, and it needs some magic to be inserted before the definition
|
||||
of a function that is exported by the library, define this macro to
|
||||
contain the relevant magic. If you do not define this macro, it
|
||||
defaults to "extern" for a C compiler and "extern C" for a C++
|
||||
compiler on non-Win32 systems. This macro appears at the start of
|
||||
every exported function that is part of the external API. It does
|
||||
not appear on functions that are "external" in the C sense, but
|
||||
which are internal to the library. */
|
||||
#undef PCRE_EXP_DEFN])
|
||||
|
||||
if test "$enable_ebcdic" = "yes"; then
|
||||
AC_DEFINE_UNQUOTED([EBCDIC], [], [
|
||||
If you are compiling for a system that uses EBCDIC instead of ASCII
|
||||
character codes, define this macro as 1. On systems that can use
|
||||
"configure", this can be done via --enable-ebcdic. PCRE will then
|
||||
assume that all input strings are in EBCDIC. If you do not define
|
||||
this macro, PCRE will assume input strings are ASCII or UTF-8 Unicode.
|
||||
It is not possible to build a version of PCRE that supports both
|
||||
EBCDIC and UTF-8.])
|
||||
fi
|
||||
|
||||
# Platform specific issues
|
||||
NO_UNDEFINED=
|
||||
EXPORT_ALL_SYMBOLS=
|
||||
case $host_os in
|
||||
cygwin* | mingw* )
|
||||
if test X"$enable_shared" = Xyes; then
|
||||
NO_UNDEFINED="-no-undefined"
|
||||
EXPORT_ALL_SYMBOLS="-Wl,--export-all-symbols"
|
||||
fi
|
||||
;;
|
||||
esac
|
||||
|
||||
# The extra LDFLAGS for each particular library
|
||||
# (Note: The libpcre*_version bits are m4 variables, assigned above)
|
||||
|
||||
EXTRA_LIBPCRE_LDFLAGS="$EXTRA_LIBPCRE_LDFLAGS \
|
||||
$NO_UNDEFINED -version-info libpcre_version"
|
||||
|
||||
EXTRA_LIBPCREPOSIX_LDFLAGS="$EXTRA_LIBPCREPOSIX_LDFLAGS \
|
||||
$NO_UNDEFINED -version-info libpcreposix_version"
|
||||
|
||||
EXTRA_LIBPCRECPP_LDFLAGS="$EXTRA_LIBPCRECPP_LDFLAGS \
|
||||
$NO_UNDEFINED -version-info libpcrecpp_version \
|
||||
$EXPORT_ALL_SYMBOLS"
|
||||
|
||||
AC_SUBST(EXTRA_LIBPCRE_LDFLAGS)
|
||||
AC_SUBST(EXTRA_LIBPCREPOSIX_LDFLAGS)
|
||||
AC_SUBST(EXTRA_LIBPCRECPP_LDFLAGS)
|
||||
|
||||
# When we run 'make distcheck', use these arguments.
|
||||
DISTCHECK_CONFIGURE_FLAGS="--enable-cpp --enable-unicode-properties"
|
||||
AC_SUBST(DISTCHECK_CONFIGURE_FLAGS)
|
||||
|
||||
# Check that, if --enable-pcregrep-libz or --enable-pcregrep-libbz2 is
|
||||
# specified, the relevant library is available.
|
||||
|
||||
if test "$enable_pcregrep_libz" = "yes"; then
|
||||
if test "$HAVE_ZLIB_H" != "1"; then
|
||||
echo "** Cannot --enable-pcregrep-libz because zlib.h was not found"
|
||||
exit 1
|
||||
fi
|
||||
if test "$HAVE_LIBZ" != "1"; then
|
||||
echo "** Cannot --enable-pcregrep-libz because libz was not found"
|
||||
exit 1
|
||||
fi
|
||||
LIBZ="-lz"
|
||||
fi
|
||||
AC_SUBST(LIBZ)
|
||||
|
||||
if test "$enable_pcregrep_libbz2" = "yes"; then
|
||||
if test "$HAVE_BZLIB_H" != "1"; then
|
||||
echo "** Cannot --enable-pcregrep-libbz2 because bzlib.h was not found"
|
||||
exit 1
|
||||
fi
|
||||
if test "$HAVE_LIBBZ2" != "1"; then
|
||||
echo "** Cannot --enable-pcregrep-libbz2 because libbz2 was not found"
|
||||
exit 1
|
||||
fi
|
||||
LIBBZ2="-lbz2"
|
||||
fi
|
||||
AC_SUBST(LIBBZ2)
|
||||
|
||||
# Similarly for --enable-pcretest-libreadline
|
||||
|
||||
if test "$enable_pcretest_libreadline" = "yes"; then
|
||||
if test "$HAVE_READLINE_H" != "1"; then
|
||||
echo "** Cannot --enable-pcretest-readline because readline/readline.h was not found."
|
||||
exit 1
|
||||
fi
|
||||
if test "$HAVE_HISTORY_H" != "1"; then
|
||||
echo "** Cannot --enable-pcretest-readline because readline/history.h was not found."
|
||||
exit 1
|
||||
fi
|
||||
LIBREADLINE="-lreadline"
|
||||
fi
|
||||
AC_SUBST(LIBREADLINE)
|
||||
|
||||
# Produce these files, in addition to config.h.
|
||||
AC_CONFIG_FILES(
|
||||
Makefile
|
||||
libpcre.pc
|
||||
libpcrecpp.pc
|
||||
pcre-config
|
||||
pcre.h
|
||||
pcre_stringpiece.h
|
||||
pcrecpparg.h
|
||||
)
|
||||
|
||||
# Make the generated script files executable.
|
||||
AC_CONFIG_COMMANDS([script-chmod], [chmod a+x pcre-config])
|
||||
|
||||
# Make sure that pcre_chartables.c is removed in case the method for
|
||||
# creating it was changed by reconfiguration.
|
||||
AC_CONFIG_COMMANDS([delete-old-chartables], [rm -f pcre_chartables.c])
|
||||
|
||||
AC_OUTPUT
|
||||
|
||||
# Print out a nice little message after configure is run displaying your
|
||||
# chosen options.
|
||||
|
||||
cat <<EOF
|
||||
|
||||
$PACKAGE-$VERSION configuration summary:
|
||||
|
||||
Install prefix .................. : ${prefix}
|
||||
C preprocessor .................. : ${CPP}
|
||||
C compiler ...................... : ${CC}
|
||||
C++ preprocessor ................ : ${CXXCPP}
|
||||
C++ compiler .................... : ${CXX}
|
||||
Linker .......................... : ${LD}
|
||||
C preprocessor flags ............ : ${CPPFLAGS}
|
||||
C compiler flags ................ : ${CFLAGS}
|
||||
C++ compiler flags .............. : ${CXXFLAGS}
|
||||
Linker flags .................... : ${LDFLAGS}
|
||||
Extra libraries ................. : ${LIBS}
|
||||
|
||||
Build C++ library ............... : ${enable_cpp}
|
||||
Enable UTF-8 support ............ : ${enable_utf8}
|
||||
Unicode properties .............. : ${enable_unicode_properties}
|
||||
Newline char/sequence ........... : ${enable_newline}
|
||||
\R matches only ANYCRLF ......... : ${enable_bsr_anycrlf}
|
||||
EBCDIC coding ................... : ${enable_ebcdic}
|
||||
Rebuild char tables ............. : ${enable_rebuild_chartables}
|
||||
Use stack recursion ............. : ${enable_stack_for_recursion}
|
||||
POSIX mem threshold ............. : ${with_posix_malloc_threshold}
|
||||
Internal link size .............. : ${with_link_size}
|
||||
Match limit ..................... : ${with_match_limit}
|
||||
Match limit recursion ........... : ${with_match_limit_recursion}
|
||||
Build shared libs ............... : ${enable_shared}
|
||||
Build static libs ............... : ${enable_static}
|
||||
Link pcregrep with libz ......... : ${enable_pcregrep_libz}
|
||||
Link pcregrep with libbz2 ....... : ${enable_pcregrep_libbz2}
|
||||
Link pcretest with libreadline .. : ${enable_pcretest_libreadline}
|
||||
|
||||
EOF
|
||||
|
||||
dnl end configure.ac
|
||||
|
||||
Executable
+589
@@ -0,0 +1,589 @@
|
||||
#! /bin/sh
|
||||
# depcomp - compile a program generating dependencies as side-effects
|
||||
|
||||
scriptversion=2007-03-29.01
|
||||
|
||||
# Copyright (C) 1999, 2000, 2003, 2004, 2005, 2006, 2007 Free Software
|
||||
# Foundation, Inc.
|
||||
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation; either version 2, or (at your option)
|
||||
# any later version.
|
||||
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with this program; if not, write to the Free Software
|
||||
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
|
||||
# 02110-1301, USA.
|
||||
|
||||
# As a special exception to the GNU General Public License, if you
|
||||
# distribute this file as part of a program that contains a
|
||||
# configuration script generated by Autoconf, you may include it under
|
||||
# the same distribution terms that you use for the rest of that program.
|
||||
|
||||
# Originally written by Alexandre Oliva <oliva@dcc.unicamp.br>.
|
||||
|
||||
case $1 in
|
||||
'')
|
||||
echo "$0: No command. Try \`$0 --help' for more information." 1>&2
|
||||
exit 1;
|
||||
;;
|
||||
-h | --h*)
|
||||
cat <<\EOF
|
||||
Usage: depcomp [--help] [--version] PROGRAM [ARGS]
|
||||
|
||||
Run PROGRAMS ARGS to compile a file, generating dependencies
|
||||
as side-effects.
|
||||
|
||||
Environment variables:
|
||||
depmode Dependency tracking mode.
|
||||
source Source file read by `PROGRAMS ARGS'.
|
||||
object Object file output by `PROGRAMS ARGS'.
|
||||
DEPDIR directory where to store dependencies.
|
||||
depfile Dependency file to output.
|
||||
tmpdepfile Temporary file to use when outputing dependencies.
|
||||
libtool Whether libtool is used (yes/no).
|
||||
|
||||
Report bugs to <bug-automake@gnu.org>.
|
||||
EOF
|
||||
exit $?
|
||||
;;
|
||||
-v | --v*)
|
||||
echo "depcomp $scriptversion"
|
||||
exit $?
|
||||
;;
|
||||
esac
|
||||
|
||||
if test -z "$depmode" || test -z "$source" || test -z "$object"; then
|
||||
echo "depcomp: Variables source, object and depmode must be set" 1>&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Dependencies for sub/bar.o or sub/bar.obj go into sub/.deps/bar.Po.
|
||||
depfile=${depfile-`echo "$object" |
|
||||
sed 's|[^\\/]*$|'${DEPDIR-.deps}'/&|;s|\.\([^.]*\)$|.P\1|;s|Pobj$|Po|'`}
|
||||
tmpdepfile=${tmpdepfile-`echo "$depfile" | sed 's/\.\([^.]*\)$/.T\1/'`}
|
||||
|
||||
rm -f "$tmpdepfile"
|
||||
|
||||
# Some modes work just like other modes, but use different flags. We
|
||||
# parameterize here, but still list the modes in the big case below,
|
||||
# to make depend.m4 easier to write. Note that we *cannot* use a case
|
||||
# here, because this file can only contain one case statement.
|
||||
if test "$depmode" = hp; then
|
||||
# HP compiler uses -M and no extra arg.
|
||||
gccflag=-M
|
||||
depmode=gcc
|
||||
fi
|
||||
|
||||
if test "$depmode" = dashXmstdout; then
|
||||
# This is just like dashmstdout with a different argument.
|
||||
dashmflag=-xM
|
||||
depmode=dashmstdout
|
||||
fi
|
||||
|
||||
case "$depmode" in
|
||||
gcc3)
|
||||
## gcc 3 implements dependency tracking that does exactly what
|
||||
## we want. Yay! Note: for some reason libtool 1.4 doesn't like
|
||||
## it if -MD -MP comes after the -MF stuff. Hmm.
|
||||
## Unfortunately, FreeBSD c89 acceptance of flags depends upon
|
||||
## the command line argument order; so add the flags where they
|
||||
## appear in depend2.am. Note that the slowdown incurred here
|
||||
## affects only configure: in makefiles, %FASTDEP% shortcuts this.
|
||||
for arg
|
||||
do
|
||||
case $arg in
|
||||
-c) set fnord "$@" -MT "$object" -MD -MP -MF "$tmpdepfile" "$arg" ;;
|
||||
*) set fnord "$@" "$arg" ;;
|
||||
esac
|
||||
shift # fnord
|
||||
shift # $arg
|
||||
done
|
||||
"$@"
|
||||
stat=$?
|
||||
if test $stat -eq 0; then :
|
||||
else
|
||||
rm -f "$tmpdepfile"
|
||||
exit $stat
|
||||
fi
|
||||
mv "$tmpdepfile" "$depfile"
|
||||
;;
|
||||
|
||||
gcc)
|
||||
## There are various ways to get dependency output from gcc. Here's
|
||||
## why we pick this rather obscure method:
|
||||
## - Don't want to use -MD because we'd like the dependencies to end
|
||||
## up in a subdir. Having to rename by hand is ugly.
|
||||
## (We might end up doing this anyway to support other compilers.)
|
||||
## - The DEPENDENCIES_OUTPUT environment variable makes gcc act like
|
||||
## -MM, not -M (despite what the docs say).
|
||||
## - Using -M directly means running the compiler twice (even worse
|
||||
## than renaming).
|
||||
if test -z "$gccflag"; then
|
||||
gccflag=-MD,
|
||||
fi
|
||||
"$@" -Wp,"$gccflag$tmpdepfile"
|
||||
stat=$?
|
||||
if test $stat -eq 0; then :
|
||||
else
|
||||
rm -f "$tmpdepfile"
|
||||
exit $stat
|
||||
fi
|
||||
rm -f "$depfile"
|
||||
echo "$object : \\" > "$depfile"
|
||||
alpha=ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
|
||||
## The second -e expression handles DOS-style file names with drive letters.
|
||||
sed -e 's/^[^:]*: / /' \
|
||||
-e 's/^['$alpha']:\/[^:]*: / /' < "$tmpdepfile" >> "$depfile"
|
||||
## This next piece of magic avoids the `deleted header file' problem.
|
||||
## The problem is that when a header file which appears in a .P file
|
||||
## is deleted, the dependency causes make to die (because there is
|
||||
## typically no way to rebuild the header). We avoid this by adding
|
||||
## dummy dependencies for each header file. Too bad gcc doesn't do
|
||||
## this for us directly.
|
||||
tr ' ' '
|
||||
' < "$tmpdepfile" |
|
||||
## Some versions of gcc put a space before the `:'. On the theory
|
||||
## that the space means something, we add a space to the output as
|
||||
## well.
|
||||
## Some versions of the HPUX 10.20 sed can't process this invocation
|
||||
## correctly. Breaking it into two sed invocations is a workaround.
|
||||
sed -e 's/^\\$//' -e '/^$/d' -e '/:$/d' | sed -e 's/$/ :/' >> "$depfile"
|
||||
rm -f "$tmpdepfile"
|
||||
;;
|
||||
|
||||
hp)
|
||||
# This case exists only to let depend.m4 do its work. It works by
|
||||
# looking at the text of this script. This case will never be run,
|
||||
# since it is checked for above.
|
||||
exit 1
|
||||
;;
|
||||
|
||||
sgi)
|
||||
if test "$libtool" = yes; then
|
||||
"$@" "-Wp,-MDupdate,$tmpdepfile"
|
||||
else
|
||||
"$@" -MDupdate "$tmpdepfile"
|
||||
fi
|
||||
stat=$?
|
||||
if test $stat -eq 0; then :
|
||||
else
|
||||
rm -f "$tmpdepfile"
|
||||
exit $stat
|
||||
fi
|
||||
rm -f "$depfile"
|
||||
|
||||
if test -f "$tmpdepfile"; then # yes, the sourcefile depend on other files
|
||||
echo "$object : \\" > "$depfile"
|
||||
|
||||
# Clip off the initial element (the dependent). Don't try to be
|
||||
# clever and replace this with sed code, as IRIX sed won't handle
|
||||
# lines with more than a fixed number of characters (4096 in
|
||||
# IRIX 6.2 sed, 8192 in IRIX 6.5). We also remove comment lines;
|
||||
# the IRIX cc adds comments like `#:fec' to the end of the
|
||||
# dependency line.
|
||||
tr ' ' '
|
||||
' < "$tmpdepfile" \
|
||||
| sed -e 's/^.*\.o://' -e 's/#.*$//' -e '/^$/ d' | \
|
||||
tr '
|
||||
' ' ' >> "$depfile"
|
||||
# Terminate the dependency line; quote the path so whitespace in it is safe.
echo >> "$depfile"
|
||||
|
||||
# The second pass generates a dummy entry for each header file.
|
||||
tr ' ' '
|
||||
' < "$tmpdepfile" \
|
||||
| sed -e 's/^.*\.o://' -e 's/#.*$//' -e '/^$/ d' -e 's/$/:/' \
|
||||
>> "$depfile"
|
||||
else
|
||||
# The sourcefile does not contain any dependencies, so just
|
||||
# store a dummy comment line, to avoid errors with the Makefile
|
||||
# "include basename.Plo" scheme.
|
||||
echo "#dummy" > "$depfile"
|
||||
fi
|
||||
rm -f "$tmpdepfile"
|
||||
;;
|
||||
|
||||
aix)
|
||||
# The C for AIX Compiler uses -M and outputs the dependencies
|
||||
# in a .u file. In older versions, this file always lives in the
|
||||
# current directory. Also, the AIX compiler puts `$object:' at the
|
||||
# start of each line; $object doesn't have directory information.
|
||||
# Version 6 uses the directory in both cases.
|
||||
dir=`echo "$object" | sed -e 's|/[^/]*$|/|'`
|
||||
test "x$dir" = "x$object" && dir=
|
||||
base=`echo "$object" | sed -e 's|^.*/||' -e 's/\.o$//' -e 's/\.lo$//'`
|
||||
if test "$libtool" = yes; then
|
||||
tmpdepfile1=$dir$base.u
|
||||
tmpdepfile2=$base.u
|
||||
tmpdepfile3=$dir.libs/$base.u
|
||||
"$@" -Wc,-M
|
||||
else
|
||||
tmpdepfile1=$dir$base.u
|
||||
tmpdepfile2=$dir$base.u
|
||||
tmpdepfile3=$dir$base.u
|
||||
"$@" -M
|
||||
fi
|
||||
stat=$?
|
||||
|
||||
if test $stat -eq 0; then :
|
||||
else
|
||||
rm -f "$tmpdepfile1" "$tmpdepfile2" "$tmpdepfile3"
|
||||
exit $stat
|
||||
fi
|
||||
|
||||
for tmpdepfile in "$tmpdepfile1" "$tmpdepfile2" "$tmpdepfile3"
|
||||
do
|
||||
test -f "$tmpdepfile" && break
|
||||
done
|
||||
if test -f "$tmpdepfile"; then
|
||||
# Each line is of the form `foo.o: dependent.h'.
|
||||
# Do two passes, one to just change these to
|
||||
# `$object: dependent.h' and one to simply `dependent.h:'.
|
||||
sed -e "s,^.*\.[a-z]*:,$object:," < "$tmpdepfile" > "$depfile"
|
||||
# That's a tab and a space in the [].
|
||||
sed -e 's,^.*\.[a-z]*:[ ]*,,' -e 's,$,:,' < "$tmpdepfile" >> "$depfile"
|
||||
else
|
||||
# The sourcefile does not contain any dependencies, so just
|
||||
# store a dummy comment line, to avoid errors with the Makefile
|
||||
# "include basename.Plo" scheme.
|
||||
echo "#dummy" > "$depfile"
|
||||
fi
|
||||
rm -f "$tmpdepfile"
|
||||
;;
|
||||
|
||||
icc)
|
||||
# Intel's C compiler understands `-MD -MF file'. However on
|
||||
# icc -MD -MF foo.d -c -o sub/foo.o sub/foo.c
|
||||
# ICC 7.0 will fill foo.d with something like
|
||||
# foo.o: sub/foo.c
|
||||
# foo.o: sub/foo.h
|
||||
# which is wrong. We want:
|
||||
# sub/foo.o: sub/foo.c
|
||||
# sub/foo.o: sub/foo.h
|
||||
# sub/foo.c:
|
||||
# sub/foo.h:
|
||||
# ICC 7.1 will output
|
||||
# foo.o: sub/foo.c sub/foo.h
|
||||
# and will wrap long lines using \ :
|
||||
# foo.o: sub/foo.c ... \
|
||||
# sub/foo.h ... \
|
||||
# ...
|
||||
|
||||
"$@" -MD -MF "$tmpdepfile"
|
||||
stat=$?
|
||||
if test $stat -eq 0; then :
|
||||
else
|
||||
rm -f "$tmpdepfile"
|
||||
exit $stat
|
||||
fi
|
||||
rm -f "$depfile"
|
||||
# Each line is of the form `foo.o: dependent.h',
|
||||
# or `foo.o: dep1.h dep2.h \', or ` dep3.h dep4.h \'.
|
||||
# Do two passes, one to just change these to
|
||||
# `$object: dependent.h' and one to simply `dependent.h:'.
|
||||
sed "s,^[^:]*:,$object :," < "$tmpdepfile" > "$depfile"
|
||||
# Some versions of the HPUX 10.20 sed can't process this invocation
|
||||
# correctly. Breaking it into two sed invocations is a workaround.
|
||||
sed 's,^[^:]*: \(.*\)$,\1,;s/^\\$//;/^$/d;/:$/d' < "$tmpdepfile" |
|
||||
sed -e 's/$/ :/' >> "$depfile"
|
||||
rm -f "$tmpdepfile"
|
||||
;;
|
||||
|
||||
hp2)
|
||||
# The "hp" stanza above does not work with aCC (C++) and HP's ia64
|
||||
# compilers, which have integrated preprocessors. The correct option
|
||||
# to use with these is +Maked; it writes dependencies to a file named
|
||||
# 'foo.d', which lands next to the object file, wherever that
|
||||
# happens to be.
|
||||
# Much of this is similar to the tru64 case; see comments there.
|
||||
dir=`echo "$object" | sed -e 's|/[^/]*$|/|'`
|
||||
test "x$dir" = "x$object" && dir=
|
||||
base=`echo "$object" | sed -e 's|^.*/||' -e 's/\.o$//' -e 's/\.lo$//'`
|
||||
if test "$libtool" = yes; then
|
||||
tmpdepfile1=$dir$base.d
|
||||
tmpdepfile2=$dir.libs/$base.d
|
||||
"$@" -Wc,+Maked
|
||||
else
|
||||
tmpdepfile1=$dir$base.d
|
||||
tmpdepfile2=$dir$base.d
|
||||
"$@" +Maked
|
||||
fi
|
||||
stat=$?
|
||||
if test $stat -eq 0; then :
|
||||
else
|
||||
rm -f "$tmpdepfile1" "$tmpdepfile2"
|
||||
exit $stat
|
||||
fi
|
||||
|
||||
for tmpdepfile in "$tmpdepfile1" "$tmpdepfile2"
|
||||
do
|
||||
test -f "$tmpdepfile" && break
|
||||
done
|
||||
if test -f "$tmpdepfile"; then
|
||||
sed -e "s,^.*\.[a-z]*:,$object:," "$tmpdepfile" > "$depfile"
|
||||
# Add `dependent.h:' lines.
|
||||
sed -ne '2,${; s/^ *//; s/ \\*$//; s/$/:/; p;}' "$tmpdepfile" >> "$depfile"
|
||||
else
|
||||
echo "#dummy" > "$depfile"
|
||||
fi
|
||||
rm -f "$tmpdepfile" "$tmpdepfile2"
|
||||
;;
|
||||
|
||||
tru64)
|
||||
# The Tru64 compiler uses -MD to generate dependencies as a side
|
||||
# effect. `cc -MD -o foo.o ...' puts the dependencies into `foo.o.d'.
|
||||
# At least on Alpha/Redhat 6.1, Compaq CCC V6.2-504 seems to put
|
||||
# dependencies in `foo.d' instead, so we check for that too.
|
||||
# Subdirectories are respected.
|
||||
dir=`echo "$object" | sed -e 's|/[^/]*$|/|'`
|
||||
test "x$dir" = "x$object" && dir=
|
||||
base=`echo "$object" | sed -e 's|^.*/||' -e 's/\.o$//' -e 's/\.lo$//'`
|
||||
|
||||
if test "$libtool" = yes; then
|
||||
# With Tru64 cc, shared objects can also be used to make a
|
||||
# static library. This mechanism is used in libtool 1.4 series to
|
||||
# handle both shared and static libraries in a single compilation.
|
||||
# With libtool 1.4, dependencies were output in $dir.libs/$base.lo.d.
|
||||
#
|
||||
# With libtool 1.5 this exception was removed, and libtool now
|
||||
# generates 2 separate objects for the 2 libraries. These two
|
||||
# compilations output dependencies in $dir.libs/$base.o.d and
|
||||
# in $dir$base.o.d. We have to check for both files, because
|
||||
# one of the two compilations can be disabled. We should prefer
|
||||
# $dir$base.o.d over $dir.libs/$base.o.d because the latter is
|
||||
# automatically cleaned when .libs/ is deleted, while ignoring
|
||||
# the former would cause a distcleancheck panic.
|
||||
tmpdepfile1=$dir.libs/$base.lo.d # libtool 1.4
|
||||
tmpdepfile2=$dir$base.o.d # libtool 1.5
|
||||
tmpdepfile3=$dir.libs/$base.o.d # libtool 1.5
|
||||
tmpdepfile4=$dir.libs/$base.d # Compaq CCC V6.2-504
|
||||
"$@" -Wc,-MD
|
||||
else
|
||||
tmpdepfile1=$dir$base.o.d
|
||||
tmpdepfile2=$dir$base.d
|
||||
tmpdepfile3=$dir$base.d
|
||||
tmpdepfile4=$dir$base.d
|
||||
"$@" -MD
|
||||
fi
|
||||
|
||||
stat=$?
|
||||
if test $stat -eq 0; then :
|
||||
else
|
||||
rm -f "$tmpdepfile1" "$tmpdepfile2" "$tmpdepfile3" "$tmpdepfile4"
|
||||
exit $stat
|
||||
fi
|
||||
|
||||
for tmpdepfile in "$tmpdepfile1" "$tmpdepfile2" "$tmpdepfile3" "$tmpdepfile4"
|
||||
do
|
||||
test -f "$tmpdepfile" && break
|
||||
done
|
||||
if test -f "$tmpdepfile"; then
|
||||
sed -e "s,^.*\.[a-z]*:,$object:," < "$tmpdepfile" > "$depfile"
|
||||
# That's a tab and a space in the [].
|
||||
sed -e 's,^.*\.[a-z]*:[ ]*,,' -e 's,$,:,' < "$tmpdepfile" >> "$depfile"
|
||||
else
|
||||
echo "#dummy" > "$depfile"
|
||||
fi
|
||||
rm -f "$tmpdepfile"
|
||||
;;
|
||||
|
||||
#nosideeffect)
|
||||
# This comment above is used by automake to tell side-effect
|
||||
# dependency tracking mechanisms from slower ones.
|
||||
|
||||
dashmstdout)
|
||||
# Important note: in order to support this mode, a compiler *must*
|
||||
# always write the preprocessed file to stdout, regardless of -o.
|
||||
"$@" || exit $?
|
||||
|
||||
# Remove the call to Libtool.
|
||||
if test "$libtool" = yes; then
|
||||
# Skip everything up to libtool's --mode=compile.  The X prefix and quoting
# keep `test' well-formed when $1 is empty, contains spaces, or looks like
# a test operator.
while test "X$1" != 'X--mode=compile'; do
|
||||
shift
|
||||
done
|
||||
shift
|
||||
fi
|
||||
|
||||
# Remove `-o $object'.
|
||||
IFS=" "
|
||||
for arg
|
||||
do
|
||||
case $arg in
|
||||
-o)
|
||||
shift
|
||||
;;
|
||||
$object)
|
||||
shift
|
||||
;;
|
||||
*)
|
||||
set fnord "$@" "$arg"
|
||||
shift # fnord
|
||||
shift # $arg
|
||||
;;
|
||||
esac
|
||||
done
|
||||
|
||||
test -z "$dashmflag" && dashmflag=-M
|
||||
# Require at least two characters before searching for `:'
|
||||
# in the target name. This is to cope with DOS-style filenames:
|
||||
# a dependency such as `c:/foo/bar' could be seen as target `c' otherwise.
|
||||
"$@" $dashmflag |
|
||||
sed 's:^[ ]*[^: ][^:][^:]*\:[ ]*:'"$object"'\: :' > "$tmpdepfile"
|
||||
rm -f "$depfile"
|
||||
cat < "$tmpdepfile" > "$depfile"
|
||||
tr ' ' '
|
||||
' < "$tmpdepfile" | \
|
||||
## Some versions of the HPUX 10.20 sed can't process this invocation
|
||||
## correctly. Breaking it into two sed invocations is a workaround.
|
||||
sed -e 's/^\\$//' -e '/^$/d' -e '/:$/d' | sed -e 's/$/ :/' >> "$depfile"
|
||||
rm -f "$tmpdepfile"
|
||||
;;
|
||||
|
||||
dashXmstdout)
|
||||
# This case only exists to satisfy depend.m4. It is never actually
|
||||
# run, as this mode is specially recognized in the preamble.
|
||||
exit 1
|
||||
;;
|
||||
|
||||
makedepend)
|
||||
"$@" || exit $?
|
||||
# Remove any Libtool call
|
||||
if test "$libtool" = yes; then
|
||||
# Skip everything up to libtool's --mode=compile.  The X prefix and quoting
# keep `test' well-formed when $1 is empty, contains spaces, or looks like
# a test operator.
while test "X$1" != 'X--mode=compile'; do
|
||||
shift
|
||||
done
|
||||
shift
|
||||
fi
|
||||
# X makedepend
|
||||
shift
|
||||
cleared=no
|
||||
for arg in "$@"; do
|
||||
case $cleared in
|
||||
no)
|
||||
set ""; shift
|
||||
cleared=yes ;;
|
||||
esac
|
||||
case "$arg" in
|
||||
-D*|-I*)
|
||||
set fnord "$@" "$arg"; shift ;;
|
||||
# Strip any option that makedepend may not understand. Remove
|
||||
# the object too, otherwise makedepend will parse it as a source file.
|
||||
-*|$object)
|
||||
;;
|
||||
*)
|
||||
set fnord "$@" "$arg"; shift ;;
|
||||
esac
|
||||
done
|
||||
# Extract the object's suffix (e.g. `.o' or `.lo') for makedepend's -o option.
# $object is quoted so the name is not word-split or globbed before sed sees it.
obj_suffix=`echo "$object" | sed 's/^.*\././'`
|
||||
touch "$tmpdepfile"
|
||||
${MAKEDEPEND-makedepend} -o"$obj_suffix" -f"$tmpdepfile" "$@"
|
||||
rm -f "$depfile"
|
||||
cat < "$tmpdepfile" > "$depfile"
|
||||
sed '1,2d' "$tmpdepfile" | tr ' ' '
|
||||
' | \
|
||||
## Some versions of the HPUX 10.20 sed can't process this invocation
|
||||
## correctly. Breaking it into two sed invocations is a workaround.
|
||||
sed -e 's/^\\$//' -e '/^$/d' -e '/:$/d' | sed -e 's/$/ :/' >> "$depfile"
|
||||
rm -f "$tmpdepfile" "$tmpdepfile".bak
|
||||
;;
|
||||
|
||||
cpp)
|
||||
# Important note: in order to support this mode, a compiler *must*
|
||||
# always write the preprocessed file to stdout.
|
||||
"$@" || exit $?
|
||||
|
||||
# Remove the call to Libtool.
|
||||
if test "$libtool" = yes; then
|
||||
# Skip everything up to libtool's --mode=compile.  The X prefix and quoting
# keep `test' well-formed when $1 is empty, contains spaces, or looks like
# a test operator.
while test "X$1" != 'X--mode=compile'; do
|
||||
shift
|
||||
done
|
||||
shift
|
||||
fi
|
||||
|
||||
# Remove `-o $object'.
|
||||
IFS=" "
|
||||
for arg
|
||||
do
|
||||
case $arg in
|
||||
-o)
|
||||
shift
|
||||
;;
|
||||
$object)
|
||||
shift
|
||||
;;
|
||||
*)
|
||||
set fnord "$@" "$arg"
|
||||
shift # fnord
|
||||
shift # $arg
|
||||
;;
|
||||
esac
|
||||
done
|
||||
|
||||
"$@" -E |
|
||||
sed -n -e '/^# [0-9][0-9]* "\([^"]*\)".*/ s:: \1 \\:p' \
|
||||
-e '/^#line [0-9][0-9]* "\([^"]*\)".*/ s:: \1 \\:p' |
|
||||
sed '$ s: \\$::' > "$tmpdepfile"
|
||||
rm -f "$depfile"
|
||||
echo "$object : \\" > "$depfile"
|
||||
cat < "$tmpdepfile" >> "$depfile"
|
||||
sed < "$tmpdepfile" '/^$/d;s/^ //;s/ \\$//;s/$/ :/' >> "$depfile"
|
||||
rm -f "$tmpdepfile"
|
||||
;;
|
||||
|
||||
msvisualcpp)
|
||||
# Important note: in order to support this mode, a compiler *must*
|
||||
# always write the preprocessed file to stdout, regardless of -o,
|
||||
# because we must use -o when running libtool.
|
||||
"$@" || exit $?
|
||||
IFS=" "
|
||||
for arg
|
||||
do
|
||||
case "$arg" in
|
||||
"-Gm"|"/Gm"|"-Gi"|"/Gi"|"-ZI"|"/ZI")
|
||||
set fnord "$@"
|
||||
shift
|
||||
shift
|
||||
;;
|
||||
*)
|
||||
set fnord "$@" "$arg"
|
||||
shift
|
||||
shift
|
||||
;;
|
||||
esac
|
||||
done
|
||||
"$@" -E |
|
||||
sed -n '/^#line [0-9][0-9]* "\([^"]*\)"/ s::echo "`cygpath -u \\"\1\\"`":p' | sort | uniq > "$tmpdepfile"
|
||||
rm -f "$depfile"
|
||||
echo "$object : \\" > "$depfile"
|
||||
. "$tmpdepfile" | sed 's% %\\ %g' | sed -n '/^\(.*\)$/ s:: \1 \\:p' >> "$depfile"
|
||||
echo " " >> "$depfile"
|
||||
. "$tmpdepfile" | sed 's% %\\ %g' | sed -n '/^\(.*\)$/ s::\1\::p' >> "$depfile"
|
||||
rm -f "$tmpdepfile"
|
||||
;;
|
||||
|
||||
none)
|
||||
exec "$@"
|
||||
;;
|
||||
|
||||
*)
|
||||
echo "Unknown depmode $depmode" 1>&2
|
||||
exit 1
|
||||
;;
|
||||
esac
|
||||
|
||||
exit 0
|
||||
|
||||
# Local Variables:
|
||||
# mode: shell-script
|
||||
# sh-indentation: 2
|
||||
# eval: (add-hook 'write-file-hooks 'time-stamp)
|
||||
# time-stamp-start: "scriptversion="
|
||||
# time-stamp-format: "%:y-%02m-%02d.%02H"
|
||||
# time-stamp-end: "$"
|
||||
# End:
|
||||
+44
-17
@@ -6,7 +6,7 @@
|
||||
and semantics are as close as possible to those of the Perl 5 language.
|
||||
|
||||
Written by Philip Hazel
|
||||
Copyright (c) 1997-2006 University of Cambridge
|
||||
Copyright (c) 1997-2008 University of Cambridge
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
@@ -38,14 +38,19 @@ POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
|
||||
/* This is a freestanding support program to generate a file containing default
|
||||
character tables for PCRE. The tables are built according to the default C
|
||||
/* This is a freestanding support program to generate a file containing
|
||||
character tables for PCRE. The tables are built according to the current
|
||||
locale. Now that pcre_maketables is a function visible to the outside world, we
|
||||
make use of its code from here in order to be consistent. */
|
||||
|
||||
#ifdef HAVE_CONFIG_H
|
||||
#include "config.h"
|
||||
#endif
|
||||
|
||||
#include <ctype.h>
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <locale.h>
|
||||
|
||||
#include "pcre_internal.h"
|
||||
|
||||
@@ -55,38 +60,60 @@ make use of its code from here in order to be consistent. */
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
int i;
|
||||
FILE *f;
|
||||
const unsigned char *tables = pcre_maketables();
|
||||
const unsigned char *base_of_tables = tables;
|
||||
int i = 1;
|
||||
const unsigned char *tables;
|
||||
const unsigned char *base_of_tables;
|
||||
|
||||
if (argc != 2)
|
||||
/* By default, the default C locale is used rather than what the building user
|
||||
happens to have set. However, if the -L option is given, set the locale from
|
||||
the LC_xxx environment variables. */
|
||||
|
||||
if (argc > 1 && strcmp(argv[1], "-L") == 0)
|
||||
{
|
||||
setlocale(LC_ALL, ""); /* Set from environment variables */
|
||||
i++;
|
||||
}
|
||||
|
||||
if (argc < i + 1)
|
||||
{
|
||||
fprintf(stderr, "dftables: one filename argument is required\n");
|
||||
return 1;
|
||||
}
|
||||
|
||||
f = fopen(argv[1], "wb");
|
||||
tables = pcre_maketables();
|
||||
base_of_tables = tables;
|
||||
|
||||
f = fopen(argv[i], "wb");
|
||||
if (f == NULL)
|
||||
{
|
||||
/* Report argv[i], the file actually passed to fopen(); argv[1] is "-L" when that option is given. */
fprintf(stderr, "dftables: failed to open %s for writing\n", argv[i]);
|
||||
return 1;
|
||||
}
|
||||
|
||||
/* There are two fprintf() calls here, because gcc in pedantic mode complains
|
||||
about the very long string otherwise. */
|
||||
/* There are several fprintf() calls here, because gcc in pedantic mode
|
||||
complains about the very long string otherwise. */
|
||||
|
||||
fprintf(f,
|
||||
"/*************************************************\n"
|
||||
"* Perl-Compatible Regular Expressions *\n"
|
||||
"*************************************************/\n\n"
|
||||
"/* This file is automatically written by the dftables auxiliary \n"
|
||||
"program. If you edit it by hand, you might like to edit the Makefile to \n"
|
||||
"prevent its ever being regenerated.\n\n");
|
||||
"/* This file was automatically written by the dftables auxiliary\n"
|
||||
"program. It contains character tables that are used when no external\n"
|
||||
"tables are passed to PCRE by the application that calls it. The tables\n"
|
||||
"are used only for characters whose code values are less than 256.\n\n");
|
||||
fprintf(f,
|
||||
"The following #includes are present because without them gcc 4.x may remove\n"
|
||||
"the array definition from the final binary if PCRE is built into a static\n"
|
||||
"library and dead code stripping is activated. This leads to link errors.\n"
|
||||
"Pulling in the header ensures that the array gets flagged as \"someone\n"
|
||||
"outside this compilation unit might reference this\" and so it will always\n"
|
||||
"be supplied to the linker. */\n\n"
|
||||
"#ifdef HAVE_CONFIG_H\n"
|
||||
"#include \"config.h\"\n"
|
||||
"#endif\n\n"
|
||||
"#include \"pcre_internal.h\"\n\n");
|
||||
fprintf(f,
|
||||
"This file contains the default tables for characters with codes less than\n"
|
||||
"128 (ASCII characters). These tables are used when no external tables are\n"
|
||||
"passed to PCRE. */\n\n"
|
||||
"const unsigned char _pcre_default_tables[] = {\n\n"
|
||||
"/* This table is a lower casing table. */\n\n");
|
||||
|
||||
@@ -162,7 +189,7 @@ if (isprint(i-8)) fprintf(f, " %c -", i-8);
|
||||
else fprintf(f, "%3d-", i-8);
|
||||
if (isprint(i-1)) fprintf(f, " %c ", i-1);
|
||||
else fprintf(f, "%3d", i-1);
|
||||
fprintf(f, " */\n\n/* End of chartables.c */\n");
|
||||
fprintf(f, " */\n\n/* End of pcre_chartables.c */\n");
|
||||
|
||||
fclose(f);
|
||||
free((void *)base_of_tables);
|
||||
|
||||
@@ -1,4 +1,10 @@
|
||||
<html>
|
||||
<!-- This is a manually maintained file that is the root of the HTML version of
|
||||
the PCRE documentation. When the HTML documents are built from the man
|
||||
page versions, the entire doc/html directory is emptied, this file is then
|
||||
copied into doc/html/index.html, and the remaining files therein are
|
||||
created by the 132html script.
|
||||
-->
|
||||
<head>
|
||||
<title>PCRE specification</title>
|
||||
</head>
|
||||
@@ -12,6 +18,9 @@ The HTML documentation for PCRE comprises the following pages:
|
||||
<tr><td><a href="pcre.html">pcre</a></td>
|
||||
<td> Introductory page</td></tr>
|
||||
|
||||
<tr><td><a href="pcre-config.html">pcre-config</a></td>
|
||||
<td> Information about the installation configuration</td></tr>
|
||||
|
||||
<tr><td><a href="pcreapi.html">pcreapi</a></td>
|
||||
<td> PCRE's native API</td></tr>
|
||||
|
||||
@@ -54,6 +63,9 @@ The HTML documentation for PCRE comprises the following pages:
|
||||
<tr><td><a href="pcrestack.html">pcrestack</a></td>
|
||||
<td> Discussion of PCRE's stack usage</td></tr>
|
||||
|
||||
<tr><td><a href="pcresyntax.html">pcresyntax</a></td>
|
||||
<td> Syntax quick-reference summary</td></tr>
|
||||
|
||||
<tr><td><a href="pcretest.html">pcretest</a></td>
|
||||
<td> The <b>pcretest</b> command for testing PCRE</td></tr>
|
||||
</table>
|
||||
|
||||
@@ -0,0 +1,88 @@
|
||||
<html>
|
||||
<head>
|
||||
<title>pcre-config specification</title>
|
||||
</head>
|
||||
<body bgcolor="#FFFFFF" text="#00005A" link="#0066FF" alink="#3399FF" vlink="#2222BB">
|
||||
<h1>pcre-config man page</h1>
|
||||
<p>
|
||||
Return to the <a href="index.html">PCRE index page</a>.
|
||||
</p>
|
||||
<p>
|
||||
This page is part of the PCRE HTML documentation. It was generated automatically
|
||||
from the original man page. If there is any nonsense in it, please consult the
|
||||
man page, in case the conversion went wrong.
|
||||
<br>
|
||||
<ul>
|
||||
<li><a name="TOC1" href="#SEC1">SYNOPSIS</a>
|
||||
<li><a name="TOC2" href="#SEC2">DESCRIPTION</a>
|
||||
<li><a name="TOC3" href="#SEC3">OPTIONS</a>
|
||||
<li><a name="TOC4" href="#SEC4">SEE ALSO</a>
|
||||
<li><a name="TOC5" href="#SEC5">AUTHOR</a>
|
||||
<li><a name="TOC6" href="#SEC6">REVISION</a>
|
||||
</ul>
|
||||
<br><a name="SEC1" href="#TOC1">SYNOPSIS</a><br>
|
||||
<P>
|
||||
<b>pcre-config [--prefix] [--exec-prefix] [--version] [--libs]</b>
|
||||
<b>[--libs-posix] [--cflags] [--cflags-posix]</b>
|
||||
</P>
|
||||
<br><a name="SEC2" href="#TOC1">DESCRIPTION</a><br>
|
||||
<P>
|
||||
<b>pcre-config</b> returns the configuration of the installed PCRE
|
||||
libraries and the options required to compile a program to use them.
|
||||
</P>
|
||||
<br><a name="SEC3" href="#TOC1">OPTIONS</a><br>
|
||||
<P>
|
||||
<b>--prefix</b>
|
||||
Writes the directory prefix used in the PCRE installation for architecture
|
||||
independent files (<i>/usr</i> on many systems, <i>/usr/local</i> on some
|
||||
systems) to the standard output.
|
||||
</P>
|
||||
<P>
|
||||
<b>--exec-prefix</b>
|
||||
Writes the directory prefix used in the PCRE installation for architecture
|
||||
dependent files (normally the same as <b>--prefix</b>) to the standard output.
|
||||
</P>
|
||||
<P>
|
||||
<b>--version</b>
|
||||
Writes the version number of the installed PCRE libraries to the standard
|
||||
output.
|
||||
</P>
|
||||
<P>
|
||||
<b>--libs</b>
|
||||
Writes to the standard output the command line options required to link
|
||||
with PCRE (<b>-lpcre</b> on many systems).
|
||||
</P>
|
||||
<P>
|
||||
<b>--libs-posix</b>
|
||||
Writes to the standard output the command line options required to link with
|
||||
the PCRE posix emulation library (<b>-lpcreposix</b> <b>-lpcre</b> on many
|
||||
systems).
|
||||
</P>
|
||||
<P>
|
||||
<b>--cflags</b>
|
||||
Writes to the standard output the command line options required to compile
|
||||
files that use PCRE (this may include some <b>-I</b> options, but is blank on
|
||||
many systems).
|
||||
</P>
|
||||
<P>
|
||||
<b>--cflags-posix</b>
|
||||
Writes to the standard output the command line options required to compile
|
||||
files that use the PCRE posix emulation library (this may include some <b>-I</b>
|
||||
options, but is blank on many systems).
|
||||
</P>
|
||||
<br><a name="SEC4" href="#TOC1">SEE ALSO</a><br>
|
||||
<P>
|
||||
<b>pcre(3)</b>
|
||||
</P>
|
||||
<br><a name="SEC5" href="#TOC1">AUTHOR</a><br>
|
||||
<P>
|
||||
This manual page was originally written by Mark Baker for the Debian GNU/Linux
|
||||
system. It has been slightly revised as a generic PCRE man page.
|
||||
</P>
|
||||
<br><a name="SEC6" href="#TOC1">REVISION</a><br>
|
||||
<P>
|
||||
Last updated: 18 April 2007
|
||||
<br>
|
||||
<p>
|
||||
Return to the <a href="index.html">PCRE index page</a>.
|
||||
</p>
|
||||
@@ -18,18 +18,26 @@ man page, in case the conversion went wrong.
|
||||
<li><a name="TOC3" href="#SEC3">LIMITATIONS</a>
|
||||
<li><a name="TOC4" href="#SEC4">UTF-8 AND UNICODE PROPERTY SUPPORT</a>
|
||||
<li><a name="TOC5" href="#SEC5">AUTHOR</a>
|
||||
<li><a name="TOC6" href="#SEC6">REVISION</a>
|
||||
</ul>
|
||||
<br><a name="SEC1" href="#TOC1">INTRODUCTION</a><br>
|
||||
<P>
|
||||
The PCRE library is a set of functions that implement regular expression
|
||||
pattern matching using the same syntax and semantics as Perl, with just a few
|
||||
differences. The current implementation of PCRE (release 6.x) corresponds
|
||||
approximately with Perl 5.8, including support for UTF-8 encoded strings and
|
||||
Unicode general category properties. However, this support has to be explicitly
|
||||
enabled; it is not the default.
|
||||
differences. Certain features that appeared in Python and PCRE before they
|
||||
appeared in Perl are also available using the Python syntax. There is also some
|
||||
support for certain .NET and Oniguruma syntax items, and there is an option for
|
||||
requesting some minor changes that give better JavaScript compatibility.
|
||||
</P>
|
||||
<P>
|
||||
In addition to the Perl-compatible matching function, PCRE also contains an
|
||||
The current implementation of PCRE (release 7.x) corresponds approximately with
|
||||
Perl 5.10, including support for UTF-8 encoded strings and Unicode general
|
||||
category properties. However, UTF-8 and Unicode support has to be explicitly
|
||||
enabled; it is not the default. The Unicode tables correspond to Unicode
|
||||
release 5.1.
|
||||
</P>
|
||||
<P>
|
||||
In addition to the Perl-compatible matching function, PCRE contains an
|
||||
alternative matching function that matches the same compiled patterns in a
|
||||
different way. In certain circumstances, the alternative function has some
|
||||
advantages. For a discussion of the two matching algorithms, see the
|
||||
@@ -52,7 +60,9 @@ supported by PCRE are given in separate documents. See the
|
||||
<a href="pcrepattern.html"><b>pcrepattern</b></a>
|
||||
and
|
||||
<a href="pcrecompat.html"><b>pcrecompat</b></a>
|
||||
pages.
|
||||
pages. There is a syntax summary in the
|
||||
<a href="pcresyntax.html"><b>pcresyntax</b></a>
|
||||
page.
|
||||
</P>
|
||||
<P>
|
||||
Some features of PCRE can be included, excluded, or changed when the library is
|
||||
@@ -82,6 +92,7 @@ all the sections are concatenated, for ease of searching. The sections are as
|
||||
follows:
|
||||
<pre>
|
||||
pcre this document
|
||||
pcre-config show PCRE installation configuration information
|
||||
pcreapi details of PCRE's native C API
|
||||
pcrebuild options for building PCRE
|
||||
pcrecallout details of the callout feature
|
||||
@@ -91,6 +102,7 @@ follows:
|
||||
pcrematching discussion of the two matching algorithms
|
||||
pcrepartial details of the partial matching facility
|
||||
pcrepattern syntax and semantics of supported regular expressions
|
||||
pcresyntax quick syntax reference
|
||||
pcreperform discussion of performance issues
|
||||
pcreposix the POSIX-compatible C API
|
||||
pcreprecompile details of saving and re-using precompiled patterns
|
||||
@@ -114,21 +126,18 @@ internal linkage size of 3 or 4 (see the <b>README</b> file in the source
|
||||
distribution and the
|
||||
<a href="pcrebuild.html"><b>pcrebuild</b></a>
|
||||
documentation for details). In these cases the limit is substantially larger.
|
||||
However, the speed of execution will be slower.
|
||||
However, the speed of execution is slower.
|
||||
</P>
|
||||
<P>
|
||||
All values in repeating quantifiers must be less than 65536. The maximum
|
||||
compiled length of subpattern with an explicit repeat count is 30000 bytes. The
|
||||
maximum number of capturing subpatterns is 65535.
|
||||
All values in repeating quantifiers must be less than 65536.
|
||||
</P>
|
||||
<P>
|
||||
There is no limit to the number of non-capturing subpatterns, but the maximum
|
||||
depth of nesting of all kinds of parenthesized subpattern, including capturing
|
||||
subpatterns, assertions, and other types of subpattern, is 200.
|
||||
There is no limit to the number of parenthesized subpatterns, but there can be
|
||||
no more than 65535 capturing subpatterns.
|
||||
</P>
|
||||
<P>
|
||||
The maximum length of name for a named subpattern is 32, and the maximum number
|
||||
of named subpatterns is 10000.
|
||||
The maximum length of name for a named subpattern is 32 characters, and the
|
||||
maximum number of named subpatterns is 10000.
|
||||
</P>
|
||||
<P>
|
||||
The maximum length of a subject string is the largest positive number that an
|
||||
@@ -151,14 +160,15 @@ category properties was added.
|
||||
In order to process UTF-8 strings, you must build PCRE to include UTF-8 support in
|
||||
the code, and, in addition, you must call
|
||||
<a href="pcre_compile.html"><b>pcre_compile()</b></a>
|
||||
with the PCRE_UTF8 option flag. When you do this, both the pattern and any
|
||||
subject strings that are matched against it are treated as UTF-8 strings
|
||||
instead of just strings of bytes.
|
||||
with the PCRE_UTF8 option flag, or the pattern must start with the sequence
|
||||
(*UTF8). When either of these is the case, both the pattern and any subject
|
||||
strings that are matched against it are treated as UTF-8 strings instead of
|
||||
just strings of bytes.
|
||||
</P>
|
||||
<P>
|
||||
If you compile PCRE with UTF-8 support, but do not use it at run time, the
|
||||
library will be a bit bigger, but the additional run time overhead is limited
|
||||
to testing the PCRE_UTF8 flag in several places, so should not be very large.
|
||||
to testing the PCRE_UTF8 flag occasionally, so should not be very big.
|
||||
</P>
|
||||
<P>
|
||||
If PCRE is built with Unicode character property support (which implies UTF-8
|
||||
@@ -172,56 +182,95 @@ documentation. Only the short names for properties are supported. For example,
|
||||
\p{L} matches a letter. Its Perl synonym, \p{Letter}, is not supported.
|
||||
Furthermore, in Perl, many properties may optionally be prefixed by "Is", for
|
||||
compatibility with Perl 5.6. PCRE does not support this.
|
||||
<a name="utf8strings"></a></P>
|
||||
<br><b>
|
||||
Validity of UTF-8 strings
|
||||
</b><br>
|
||||
<P>
|
||||
When you set the PCRE_UTF8 flag, the strings passed as patterns and subjects
|
||||
are (by default) checked for validity on entry to the relevant functions. From
|
||||
release 7.3 of PCRE, the check is according to the rules of RFC 3629, which are
|
||||
themselves derived from the Unicode specification. Earlier releases of PCRE
|
||||
followed the rules of RFC 2279, which allows the full range of 31-bit values (0
|
||||
to 0x7FFFFFFF). The current check allows only values in the range U+0 to
|
||||
U+10FFFF, excluding U+D800 to U+DFFF.
|
||||
</P>
|
||||
<P>
|
||||
The following comments apply when PCRE is running in UTF-8 mode:
|
||||
The excluded code points are the "Low Surrogate Area" of Unicode, of which the
|
||||
Unicode Standard says this: "The Low Surrogate Area does not contain any
|
||||
character assignments, consequently no character code charts or namelists are
|
||||
provided for this area. Surrogates are reserved for use with UTF-16 and then
|
||||
must be used in pairs." The code points that are encoded by UTF-16 pairs are
|
||||
available as independent code points in the UTF-8 encoding. (In other words,
|
||||
the whole surrogate thing is a fudge for UTF-16 which unfortunately messes up
|
||||
UTF-8.)
|
||||
</P>
|
||||
<P>
|
||||
1. When you set the PCRE_UTF8 flag, the strings passed as patterns and subjects
|
||||
are checked for validity on entry to the relevant functions. If an invalid
|
||||
UTF-8 string is passed, an error return is given. In some situations, you may
|
||||
already know that your strings are valid, and therefore want to skip these
|
||||
checks in order to improve performance. If you set the PCRE_NO_UTF8_CHECK flag
|
||||
at compile time or at run time, PCRE assumes that the pattern or subject it
|
||||
is given (respectively) contains only valid UTF-8 codes. In this case, it does
|
||||
not diagnose an invalid UTF-8 string. If you pass an invalid UTF-8 string to
|
||||
PCRE when PCRE_NO_UTF8_CHECK is set, the results are undefined. Your program
|
||||
may crash.
|
||||
If an invalid UTF-8 string is passed to PCRE, an error return
|
||||
(PCRE_ERROR_BADUTF8) is given. In some situations, you may already know that
|
||||
your strings are valid, and therefore want to skip these checks in order to
|
||||
improve performance. If you set the PCRE_NO_UTF8_CHECK flag at compile time or
|
||||
at run time, PCRE assumes that the pattern or subject it is given
|
||||
(respectively) contains only valid UTF-8 codes. In this case, it does not
|
||||
diagnose an invalid UTF-8 string.
|
||||
</P>
|
||||
<P>
|
||||
2. An unbraced hexadecimal escape sequence (such as \xb3) matches a two-byte
|
||||
If you pass an invalid UTF-8 string when PCRE_NO_UTF8_CHECK is set, what
|
||||
happens depends on why the string is invalid. If the string conforms to the
|
||||
"old" definition of UTF-8 (RFC 2279), it is processed as a string of characters
|
||||
in the range 0 to 0x7FFFFFFF. In other words, apart from the initial validity
|
||||
test, PCRE (when in UTF-8 mode) handles strings according to the more liberal
|
||||
rules of RFC 2279. However, if the string does not even conform to RFC 2279,
|
||||
the result is undefined. Your program may crash.
|
||||
</P>
|
||||
<P>
|
||||
If you want to process strings of values in the full range 0 to 0x7FFFFFFF,
|
||||
encoded in a UTF-8-like manner as per the old RFC, you can set
|
||||
PCRE_NO_UTF8_CHECK to bypass the more restrictive test. However, in this
|
||||
situation, you will have to apply your own validity check.
|
||||
</P>
|
||||
<br><b>
|
||||
General comments about UTF-8 mode
|
||||
</b><br>
|
||||
<P>
|
||||
1. An unbraced hexadecimal escape sequence (such as \xb3) matches a two-byte
|
||||
UTF-8 character if the value is greater than 127.
|
||||
</P>
|
||||
<P>
|
||||
3. Octal numbers up to \777 are recognized, and match two-byte UTF-8
|
||||
2. Octal numbers up to \777 are recognized, and match two-byte UTF-8
|
||||
characters for values greater than \177.
|
||||
</P>
|
||||
<P>
|
||||
4. Repeat quantifiers apply to complete UTF-8 characters, not to individual
|
||||
3. Repeat quantifiers apply to complete UTF-8 characters, not to individual
|
||||
bytes, for example: \x{100}{3}.
|
||||
</P>
|
||||
<P>
|
||||
5. The dot metacharacter matches one UTF-8 character instead of a single byte.
|
||||
4. The dot metacharacter matches one UTF-8 character instead of a single byte.
|
||||
</P>
|
||||
<P>
|
||||
6. The escape sequence \C can be used to match a single byte in UTF-8 mode,
|
||||
5. The escape sequence \C can be used to match a single byte in UTF-8 mode,
|
||||
but its use can lead to some strange effects. This facility is not available in
|
||||
the alternative matching function, <b>pcre_dfa_exec()</b>.
|
||||
</P>
|
||||
<P>
|
||||
7. The character escapes \b, \B, \d, \D, \s, \S, \w, and \W correctly
|
||||
6. The character escapes \b, \B, \d, \D, \s, \S, \w, and \W correctly
|
||||
test characters of any code value, but the characters that PCRE recognizes as
|
||||
digits, spaces, or word characters remain the same set as before, all with
|
||||
values less than 256. This remains true even when PCRE includes Unicode
|
||||
property support, because to do otherwise would slow down PCRE in many common
|
||||
cases. If you really want to test for a wider sense of, say, "digit", you
|
||||
must use Unicode property tests such as \p{Nd}.
|
||||
must use Unicode property tests such as \p{Nd}. Note that this also applies to
|
||||
\b, because it is defined in terms of \w and \W.
|
||||
</P>
|
||||
<P>
|
||||
8. Similarly, characters that match the POSIX named character classes are all
|
||||
7. Similarly, characters that match the POSIX named character classes are all
|
||||
low-valued characters.
|
||||
</P>
|
||||
<P>
|
||||
8. However, the Perl 5.10 horizontal and vertical whitespace matching escapes
|
||||
(\h, \H, \v, and \V) do match all the appropriate Unicode characters.
|
||||
</P>
|
||||
<P>
|
||||
9. Case-insensitive matching applies only to characters whose values are less
|
||||
than 128, unless PCRE is built with Unicode property support. Even when Unicode
|
||||
property support is available, PCRE still uses its own character tables when
|
||||
@@ -236,17 +285,22 @@ these are not supported by PCRE.
|
||||
<P>
|
||||
Philip Hazel
|
||||
<br>
|
||||
University Computing Service,
|
||||
University Computing Service
|
||||
<br>
|
||||
Cambridge CB2 3QH, England.
|
||||
<br>
|
||||
Cambridge CB2 3QG, England.
|
||||
</P>
|
||||
<P>
|
||||
Putting an actual email address here seems to have been a spam magnet, so I've
|
||||
taken it away. If you want to email me, use my initial and surname, separated
|
||||
by a dot, at the domain ucs.cam.ac.uk.
|
||||
Last updated: 05 June 2006
|
||||
taken it away. If you want to email me, use my two initials, followed by the
|
||||
two digits 10, at the domain cam.ac.uk.
|
||||
</P>
|
||||
<br><a name="SEC6" href="#TOC1">REVISION</a><br>
|
||||
<P>
|
||||
Last updated: 11 April 2009
|
||||
<br>
|
||||
Copyright © 1997-2009 University of Cambridge.
|
||||
<br>
|
||||
Copyright © 1997-2006 University of Cambridge.
|
||||
<p>
|
||||
Return to the <a href="index.html">PCRE index page</a>.
|
||||
</p>
|
||||
|
||||
@@ -27,8 +27,9 @@ SYNOPSIS
|
||||
DESCRIPTION
|
||||
</b><br>
|
||||
<P>
|
||||
This function compiles a regular expression into an internal form. Its
|
||||
arguments are:
|
||||
This function compiles a regular expression into an internal form. It is the
|
||||
same as <b>pcre_compile2()</b>, except for the absence of the <i>errorcodeptr</i>
|
||||
argument. Its arguments are:
|
||||
<pre>
|
||||
<i>pattern</i> A zero-terminated string containing the
|
||||
regular expression to be compiled
|
||||
@@ -40,34 +41,42 @@ arguments are:
|
||||
</pre>
|
||||
The option bits are:
|
||||
<pre>
|
||||
PCRE_ANCHORED Force pattern anchoring
|
||||
PCRE_AUTO_CALLOUT Compile automatic callouts
|
||||
PCRE_CASELESS Do caseless matching
|
||||
PCRE_DOLLAR_ENDONLY $ not to match newline at end
|
||||
PCRE_DOTALL . matches anything including NL
|
||||
PCRE_DUPNAMES Allow duplicate names for subpatterns
|
||||
PCRE_EXTENDED Ignore whitespace and # comments
|
||||
PCRE_EXTRA PCRE extra features
|
||||
(not much use currently)
|
||||
PCRE_FIRSTLINE Force matching to be before newline
|
||||
PCRE_MULTILINE ^ and $ match newlines within data
|
||||
PCRE_NEWLINE_CR Set CR as the newline sequence
|
||||
PCRE_NEWLINE_CRLF Set CRLF as the newline sequence
|
||||
PCRE_NEWLINE_LF Set LF as the newline sequence
|
||||
PCRE_NO_AUTO_CAPTURE Disable numbered capturing paren-
|
||||
theses (named ones available)
|
||||
PCRE_UNGREEDY Invert greediness of quantifiers
|
||||
PCRE_UTF8 Run in UTF-8 mode
|
||||
PCRE_NO_UTF8_CHECK Do not check the pattern for UTF-8
|
||||
validity (only relevant if
|
||||
PCRE_UTF8 is set)
|
||||
PCRE_ANCHORED Force pattern anchoring
|
||||
PCRE_AUTO_CALLOUT Compile automatic callouts
|
||||
PCRE_BSR_ANYCRLF \R matches only CR, LF, or CRLF
|
||||
PCRE_BSR_UNICODE \R matches all Unicode line endings
|
||||
PCRE_CASELESS Do caseless matching
|
||||
PCRE_DOLLAR_ENDONLY $ not to match newline at end
|
||||
PCRE_DOTALL . matches anything including NL
|
||||
PCRE_DUPNAMES Allow duplicate names for subpatterns
|
||||
PCRE_EXTENDED Ignore whitespace and # comments
|
||||
PCRE_EXTRA PCRE extra features
|
||||
(not much use currently)
|
||||
PCRE_FIRSTLINE Force matching to be before newline
|
||||
PCRE_JAVASCRIPT_COMPAT JavaScript compatibility
|
||||
PCRE_MULTILINE ^ and $ match newlines within data
|
||||
PCRE_NEWLINE_ANY Recognize any Unicode newline sequence
|
||||
PCRE_NEWLINE_ANYCRLF Recognize CR, LF, and CRLF as newline
|
||||
sequences
|
||||
PCRE_NEWLINE_CR Set CR as the newline sequence
|
||||
PCRE_NEWLINE_CRLF Set CRLF as the newline sequence
|
||||
PCRE_NEWLINE_LF Set LF as the newline sequence
|
||||
PCRE_NO_AUTO_CAPTURE Disable numbered capturing paren-
|
||||
theses (named ones available)
|
||||
PCRE_UNGREEDY Invert greediness of quantifiers
|
||||
PCRE_UTF8 Run in UTF-8 mode
|
||||
PCRE_NO_UTF8_CHECK Do not check the pattern for UTF-8
|
||||
validity (only relevant if
|
||||
PCRE_UTF8 is set)
|
||||
</pre>
|
||||
PCRE must be built with UTF-8 support in order to use PCRE_UTF8 and
|
||||
PCRE_NO_UTF8_CHECK.
|
||||
</P>
|
||||
<P>
|
||||
The yield of the function is a pointer to a private data structure that
|
||||
contains the compiled pattern, or NULL if an error was detected.
|
||||
contains the compiled pattern, or NULL if an error was detected. Note that
|
||||
compiling regular expressions with one version of PCRE for use with a different
|
||||
version is not guaranteed to work and may cause crashes.
|
||||
</P>
|
||||
<P>
|
||||
There is a complete description of the PCRE native API in the
|
||||
|
||||
@@ -56,6 +56,8 @@ The option bits are:
|
||||
(not much use currently)
|
||||
PCRE_FIRSTLINE Force matching to be before newline
|
||||
PCRE_MULTILINE ^ and $ match newlines within data
|
||||
PCRE_NEWLINE_ANY Recognize any Unicode newline sequence
|
||||
PCRE_NEWLINE_ANYCRLF Recognize CR, LF, and CRLF as newline sequences
|
||||
PCRE_NEWLINE_CR Set CR as the newline sequence
|
||||
PCRE_NEWLINE_CRLF Set CRLF as the newline sequence
|
||||
PCRE_NEWLINE_LF Set LF as the newline sequence
|
||||
@@ -72,7 +74,9 @@ PCRE_NO_UTF8_CHECK.
|
||||
</P>
|
||||
<P>
|
||||
The yield of the function is a pointer to a private data structure that
|
||||
contains the compiled pattern, or NULL if an error was detected.
|
||||
contains the compiled pattern, or NULL if an error was detected. Note that
|
||||
compiling regular expressions with one version of PCRE for use with a different
|
||||
version is not guaranteed to work and may cause crashes.
|
||||
</P>
|
||||
<P>
|
||||
There is a complete description of the PCRE native API in the
|
||||
|
||||
@@ -38,7 +38,15 @@ The available codes are:
|
||||
PCRE_CONFIG_MATCH_LIMIT Internal resource limit
|
||||
PCRE_CONFIG_MATCH_LIMIT_RECURSION
|
||||
Internal recursion depth limit
|
||||
PCRE_CONFIG_NEWLINE Value of the newline sequence
|
||||
PCRE_CONFIG_NEWLINE Value of the default newline sequence:
|
||||
13 (0x000d) for CR
|
||||
10 (0x000a) for LF
|
||||
3338 (0x0d0a) for CRLF
|
||||
-2 for ANYCRLF
|
||||
-1 for ANY
|
||||
PCRE_CONFIG_BSR Indicates what \R matches by default:
|
||||
0 all Unicode line endings
|
||||
1 CR, LF, or CRLF only
|
||||
PCRE_CONFIG_POSIX_MALLOC_THRESHOLD
|
||||
Threshold of return slots, above
|
||||
which <b>malloc()</b> is used by
|
||||
|
||||
@@ -37,7 +37,7 @@ buffer. The arguments are:
|
||||
<i>buffer</i> Buffer to receive the string
|
||||
<i>buffersize</i> Size of buffer
|
||||
</pre>
|
||||
The yield is the legnth of the string, PCRE_ERROR_NOMEMORY if the buffer was
|
||||
The yield is the length of the string, PCRE_ERROR_NOMEMORY if the buffer was
|
||||
too small, or PCRE_ERROR_NOSUBSTRING if the string number is invalid.
|
||||
</P>
|
||||
<P>
|
||||
|
||||
@@ -29,9 +29,9 @@ DESCRIPTION
|
||||
</b><br>
|
||||
<P>
|
||||
This function matches a compiled regular expression against a given subject
|
||||
string, using a DFA matching algorithm (<i>not</i> Perl-compatible). Note that
|
||||
the main, Perl-compatible, matching function is <b>pcre_exec()</b>. The
|
||||
arguments for this function are:
|
||||
string, using an alternative matching algorithm that scans the subject string
|
||||
just once (<i>not</i> Perl-compatible). Note that the main, Perl-compatible,
|
||||
matching function is <b>pcre_exec()</b>. The arguments for this function are:
|
||||
<pre>
|
||||
<i>code</i> Points to the compiled pattern
|
||||
<i>extra</i> Points to an associated <b>pcre_extra</b> structure,
|
||||
@@ -49,12 +49,17 @@ arguments for this function are:
|
||||
The options are:
|
||||
<pre>
|
||||
PCRE_ANCHORED Match only at the first position
|
||||
PCRE_BSR_ANYCRLF \R matches only CR, LF, or CRLF
|
||||
PCRE_BSR_UNICODE \R matches all Unicode line endings
|
||||
PCRE_NEWLINE_ANY Recognize any Unicode newline sequence
|
||||
PCRE_NEWLINE_ANYCRLF Recognize CR, LF, and CRLF as newline sequences
|
||||
PCRE_NEWLINE_CR Set CR as the newline sequence
|
||||
PCRE_NEWLINE_CRLF Set CRLF as the newline sequence
|
||||
PCRE_NEWLINE_LF Set LF as the newline sequence
|
||||
PCRE_NOTBOL Subject is not the beginning of a line
|
||||
PCRE_NOTEOL Subject is not the end of a line
|
||||
PCRE_NOTEMPTY An empty string is not a valid match
|
||||
PCRE_NO_START_OPTIMIZE Do not do "start-match" optimizations
|
||||
PCRE_NO_UTF8_CHECK Do not check the subject for UTF-8
|
||||
validity (only relevant if PCRE_UTF8
|
||||
was set at compile time)
|
||||
@@ -62,8 +67,8 @@ The options are:
|
||||
PCRE_DFA_SHORTEST Return only the shortest match
|
||||
PCRE_DFA_RESTART This is a restart after a partial match
|
||||
</pre>
|
||||
There are restrictions on what may appear in a pattern when matching using the
|
||||
DFA algorithm is requested. Details are given in the
|
||||
There are restrictions on what may appear in a pattern when using this matching
|
||||
function. Details are given in the
|
||||
<a href="pcrematching.html"><b>pcrematching</b></a>
|
||||
documentation.
|
||||
</P>
|
||||
@@ -79,7 +84,7 @@ A <b>pcre_extra</b> structure contains the following fields:
|
||||
</pre>
|
||||
The flag bits are PCRE_EXTRA_STUDY_DATA, PCRE_EXTRA_MATCH_LIMIT,
|
||||
PCRE_EXTRA_MATCH_LIMIT_RECURSION, PCRE_EXTRA_CALLOUT_DATA, and
|
||||
PCRE_EXTRA_TABLES. For DFA matching, the <i>match_limit</i> and
|
||||
PCRE_EXTRA_TABLES. For this matching function, the <i>match_limit</i> and
|
||||
<i>match_limit_recursion</i> fields are not used, and must not be set.
|
||||
</P>
|
||||
<P>
|
||||
|
||||
@@ -45,19 +45,26 @@ offsets to captured substrings. Its arguments are:
|
||||
The options are:
|
||||
<pre>
|
||||
PCRE_ANCHORED Match only at the first position
|
||||
PCRE_BSR_ANYCRLF \R matches only CR, LF, or CRLF
|
||||
PCRE_BSR_UNICODE \R matches all Unicode line endings
|
||||
PCRE_NEWLINE_ANY Recognize any Unicode newline sequence
|
||||
PCRE_NEWLINE_ANYCRLF Recognize CR, LF, and CRLF as newline sequences
|
||||
PCRE_NEWLINE_CR Set CR as the newline sequence
|
||||
PCRE_NEWLINE_CRLF Set CRLF as the newline sequence
|
||||
PCRE_NEWLINE_LF Set LF as the newline sequence
|
||||
PCRE_NOTBOL Subject is not the beginning of a line
|
||||
PCRE_NOTEOL Subject is not the end of a line
|
||||
PCRE_NOTEMPTY An empty string is not a valid match
|
||||
PCRE_NO_START_OPTIMIZE Do not do "start-match" optimizations
|
||||
PCRE_NO_UTF8_CHECK Do not check the subject for UTF-8
|
||||
validity (only relevant if PCRE_UTF8
|
||||
was set at compile time)
|
||||
PCRE_PARTIAL Return PCRE_ERROR_PARTIAL for a partial match
|
||||
</pre>
|
||||
There are restrictions on what may appear in a pattern when partial matching is
|
||||
requested.
|
||||
requested. For details, see the
|
||||
<a href="pcrepartial.html"><b>pcrepartial</b></a>
|
||||
page.
|
||||
</P>
|
||||
<P>
|
||||
A <b>pcre_extra</b> structure contains the following fields:
|
||||
|
||||
@@ -42,13 +42,14 @@ The following information is available:
|
||||
-1 for start of string
|
||||
or after newline, or
|
||||
-2 otherwise
|
||||
PCRE_INFO_FIRSTTABLE Table of first bytes
|
||||
(after studying)
|
||||
PCRE_INFO_FIRSTTABLE Table of first bytes (after studying)
|
||||
PCRE_INFO_JCHANGED Return 1 if (?J) or (?-J) was used
|
||||
PCRE_INFO_LASTLITERAL Literal last byte required
|
||||
PCRE_INFO_NAMECOUNT Number of named subpatterns
|
||||
PCRE_INFO_NAMEENTRYSIZE Size of name table entry
|
||||
PCRE_INFO_NAMETABLE Pointer to name table
|
||||
PCRE_INFO_OPTIONS Options used for compilation
|
||||
PCRE_INFO_OKPARTIAL Return 1 if partial matching can be tried
|
||||
PCRE_INFO_OPTIONS Option bits used for compilation
|
||||
PCRE_INFO_SIZE Size of compiled pattern
|
||||
PCRE_INFO_STUDYSIZE Size of study data
|
||||
</pre>
|
||||
|
||||
@@ -39,9 +39,10 @@ arguments are:
|
||||
<i>stringptr</i> Where to put the string pointer
|
||||
</pre>
|
||||
The memory in which the substring is placed is obtained by calling
|
||||
<b>pcre_malloc()</b>. The yield of the function is the length of the extracted
|
||||
substring, PCRE_ERROR_NOMEMORY if sufficient memory could not be obtained, or
|
||||
PCRE_ERROR_NOSUBSTRING if the string name is invalid.
|
||||
<b>pcre_malloc()</b>. The convenience function <b>pcre_free_substring()</b> can
|
||||
be used to free it when it is no longer needed. The yield of the function is
|
||||
the length of the extracted substring, PCRE_ERROR_NOMEMORY if sufficient memory
|
||||
could not be obtained, or PCRE_ERROR_NOSUBSTRING if the string name is invalid.
|
||||
</P>
|
||||
<P>
|
||||
There is a complete description of the PCRE native API in the
|
||||
|
||||
@@ -33,7 +33,10 @@ parenthesis in a compiled pattern. Its arguments are:
|
||||
<i>name</i> Name whose number is required
|
||||
</pre>
|
||||
The yield of the function is the number of the parenthesis if the name is
|
||||
found, or PCRE_ERROR_NOSUBSTRING otherwise.
|
||||
found, or PCRE_ERROR_NOSUBSTRING otherwise. When duplicate names are allowed
|
||||
(PCRE_DUPNAMES is set), it is not defined which of the numbers is returned by
|
||||
<b>pcre_get_stringnumber()</b>. You can obtain the complete list by calling
|
||||
<b>pcre_get_stringtable_entries()</b>.
|
||||
</P>
|
||||
<P>
|
||||
There is a complete description of the PCRE native API in the
|
||||
|
||||
@@ -44,7 +44,7 @@ PCRE_ERROR_NOSUBSTRING if none are found.
|
||||
There is a complete description of the PCRE native API, including the format of
|
||||
the table entries, in the
|
||||
<a href="pcreapi.html"><b>pcreapi</b></a>
|
||||
page and a description of the POSIX API in the
|
||||
page, and a description of the POSIX API in the
|
||||
<a href="pcreposix.html"><b>pcreposix</b></a>
|
||||
page.
|
||||
<p>
|
||||
|
||||
@@ -37,9 +37,10 @@ arguments are:
|
||||
<i>stringptr</i> Where to put the string pointer
|
||||
</pre>
|
||||
The memory in which the substring is placed is obtained by calling
|
||||
<b>pcre_malloc()</b>. The yield of the function is the length of the substring,
|
||||
PCRE_ERROR_NOMEMORY if sufficient memory could not be obtained, or
|
||||
PCRE_ERROR_NOSUBSTRING if the string number is invalid.
|
||||
<b>pcre_malloc()</b>. The convenience function <b>pcre_free_substring()</b> can
|
||||
be used to free it when it is no longer needed. The yield of the function is
|
||||
the length of the substring, PCRE_ERROR_NOMEMORY if sufficient memory could not
|
||||
be obtained, or PCRE_ERROR_NOSUBSTRING if the string number is invalid.
|
||||
</P>
|
||||
<P>
|
||||
There is a complete description of the PCRE native API in the
|
||||
|
||||
@@ -35,10 +35,12 @@ substrings. The arguments are:
|
||||
<i>listptr</i> Where to put a pointer to the list
|
||||
</pre>
|
||||
The memory in which the substrings and the list are placed is obtained by
|
||||
calling <b>pcre_malloc()</b>. A pointer to a list of pointers is put in
|
||||
the variable whose address is in <i>listptr</i>. The list is terminated by a
|
||||
NULL pointer. The yield of the function is zero on success or
|
||||
PCRE_ERROR_NOMEMORY if sufficient memory could not be obtained.
|
||||
calling <b>pcre_malloc()</b>. The convenience function
|
||||
<b>pcre_free_substring_list()</b> can be used to free it when it is no longer
|
||||
needed. A pointer to a list of pointers is put in the variable whose address is
|
||||
in <i>listptr</i>. The list is terminated by a NULL pointer. The yield of the
|
||||
function is zero on success or PCRE_ERROR_NOMEMORY if sufficient memory could
|
||||
not be obtained.
|
||||
</P>
|
||||
<P>
|
||||
There is a complete description of the PCRE native API in the
|
||||
|
||||
+374
-139
@@ -32,6 +32,9 @@ man page, in case the conversion went wrong.
|
||||
<li><a name="TOC17" href="#SEC17">DUPLICATE SUBPATTERN NAMES</a>
|
||||
<li><a name="TOC18" href="#SEC18">FINDING ALL POSSIBLE MATCHES</a>
|
||||
<li><a name="TOC19" href="#SEC19">MATCHING A PATTERN: THE ALTERNATIVE FUNCTION</a>
|
||||
<li><a name="TOC20" href="#SEC20">SEE ALSO</a>
|
||||
<li><a name="TOC21" href="#SEC21">AUTHOR</a>
|
||||
<li><a name="TOC22" href="#SEC22">REVISION</a>
|
||||
</ul>
|
||||
<br><a name="SEC1" href="#TOC1">PCRE NATIVE API</a><br>
|
||||
<P>
|
||||
@@ -140,8 +143,8 @@ man page, in case the conversion went wrong.
|
||||
</P>
|
||||
<br><a name="SEC2" href="#TOC1">PCRE API OVERVIEW</a><br>
|
||||
<P>
|
||||
PCRE has its own native API, which is described in this document. There is
|
||||
also a set of wrapper functions that correspond to the POSIX regular expression
|
||||
PCRE has its own native API, which is described in this document. There are
|
||||
also some wrapper functions that correspond to the POSIX regular expression
|
||||
API. These are described in the
|
||||
<a href="pcreposix.html"><b>pcreposix</b></a>
|
||||
documentation. Both of these APIs define a set of C function calls. A C++
|
||||
@@ -164,15 +167,15 @@ in a Perl-compatible manner. A sample program that demonstrates the simplest
|
||||
way of using them is provided in the file called <i>pcredemo.c</i> in the source
|
||||
distribution. The
|
||||
<a href="pcresample.html"><b>pcresample</b></a>
|
||||
documentation describes how to run it.
|
||||
documentation describes how to compile and run it.
|
||||
</P>
|
||||
<P>
|
||||
A second matching function, <b>pcre_dfa_exec()</b>, which is not
|
||||
Perl-compatible, is also provided. This uses a different algorithm for the
|
||||
matching. The alternative algorithm finds all possible matches (at a given
|
||||
point in the subject). However, this algorithm does not return captured
|
||||
substrings. A description of the two matching algorithms and their advantages
|
||||
and disadvantages is given in the
|
||||
point in the subject), and scans the subject just once. However, this algorithm
|
||||
does not return captured substrings. A description of the two matching
|
||||
algorithms and their advantages and disadvantages is given in the
|
||||
<a href="pcrematching.html"><b>pcrematching</b></a>
|
||||
documentation.
|
||||
</P>
|
||||
@@ -240,19 +243,45 @@ by the caller to a "callout" function, which PCRE will then call at specified
|
||||
points during a matching operation. Details are given in the
|
||||
<a href="pcrecallout.html"><b>pcrecallout</b></a>
|
||||
documentation.
|
||||
</P>
|
||||
<a name="newlines"></a></P>
|
||||
<br><a name="SEC3" href="#TOC1">NEWLINES</a><br>
|
||||
<P>
|
||||
PCRE supports three different conventions for indicating line breaks in
|
||||
strings: a single CR character, a single LF character, or the two-character
|
||||
sequence CRLF. All three are used as "standard" by different operating systems.
|
||||
When PCRE is built, a default can be specified. The default default is LF,
|
||||
which is the Unix standard. When PCRE is run, the default can be overridden,
|
||||
either when a pattern is compiled, or when it is matched.
|
||||
<br>
|
||||
<br>
|
||||
PCRE supports five different conventions for indicating line breaks in
|
||||
strings: a single CR (carriage return) character, a single LF (linefeed)
|
||||
character, the two-character sequence CRLF, any of the three preceding, or any
|
||||
Unicode newline sequence. The Unicode newline sequences are the three just
|
||||
mentioned, plus the single characters VT (vertical tab, U+000B), FF (formfeed,
|
||||
U+000C), NEL (next line, U+0085), LS (line separator, U+2028), and PS
|
||||
(paragraph separator, U+2029).
|
||||
</P>
|
||||
<P>
|
||||
Each of the first three conventions is used by at least one operating system as
|
||||
its standard newline sequence. When PCRE is built, a default can be specified.
|
||||
The default default is LF, which is the Unix standard. When PCRE is run, the
|
||||
default can be overridden, either when a pattern is compiled, or when it is
|
||||
matched.
|
||||
</P>
|
||||
<P>
|
||||
At compile time, the newline convention can be specified by the <i>options</i>
|
||||
argument of <b>pcre_compile()</b>, or it can be specified by special text at the
|
||||
start of the pattern itself; this overrides any other settings. See the
|
||||
<a href="pcrepattern.html"><b>pcrepattern</b></a>
|
||||
page for details of the special character sequences.
|
||||
</P>
|
||||
<P>
|
||||
In the PCRE documentation the word "newline" is used to mean "the character or
|
||||
pair of characters that indicate a line break".
|
||||
pair of characters that indicate a line break". The choice of newline
|
||||
convention affects the handling of the dot, circumflex, and dollar
|
||||
metacharacters, the handling of #-comments in /x mode, and, when CRLF is a
|
||||
recognized line ending sequence, the match position advancement for a
|
||||
non-anchored pattern. There is more detail about this in the
|
||||
<a href="#execoptions">section on <b>pcre_exec()</b> options</a>
|
||||
below.
|
||||
</P>
|
||||
<P>
|
||||
The choice of newline convention does not affect the interpretation of
|
||||
the \n or \r escape sequences, nor does it affect what \R matches, which is
|
||||
controlled in a similar way, but by separate options.
|
||||
</P>
|
||||
<br><a name="SEC4" href="#TOC1">MULTITHREADING</a><br>
|
||||
<P>
|
||||
@@ -271,7 +300,9 @@ The compiled form of a regular expression can be saved and re-used at a later
|
||||
time, possibly by a different program, and even on a host other than the one on
|
||||
which it was compiled. Details are given in the
|
||||
<a href="pcreprecompile.html"><b>pcreprecompile</b></a>
|
||||
documentation.
|
||||
documentation. However, compiling a regular expression with one version of PCRE
|
||||
for use with a different version is not guaranteed to work and may cause
|
||||
crashes.
|
||||
</P>
|
||||
<br><a name="SEC6" href="#TOC1">CHECKING BUILD-TIME OPTIONS</a><br>
|
||||
<P>
|
||||
@@ -301,9 +332,18 @@ properties is available; otherwise it is set to zero.
|
||||
PCRE_CONFIG_NEWLINE
|
||||
</pre>
|
||||
The output is an integer whose value specifies the default character sequence
|
||||
that is recognized as meaning "newline". The three values that are supported
|
||||
are: 10 for LF, 13 for CR, and 3338 for CRLF. The default should normally be
|
||||
the standard sequence for your operating system.
|
||||
that is recognized as meaning "newline". The five values that are supported
|
||||
are: 10 for LF, 13 for CR, 3338 for CRLF, -2 for ANYCRLF, and -1 for ANY.
|
||||
Though they are derived from ASCII, the same values are returned in EBCDIC
|
||||
environments. The default should normally correspond to the standard sequence
|
||||
for your operating system.
|
||||
<pre>
|
||||
PCRE_CONFIG_BSR
|
||||
</pre>
|
||||
The output is an integer whose value indicates what character sequences the \R
|
||||
escape sequence matches by default. A value of 0 means that \R matches any
|
||||
Unicode line ending sequence; a value of 1 means that \R matches only CR, LF,
|
||||
or CRLF. The default can be overridden when a pattern is compiled or matched.
|
||||
<pre>
|
||||
PCRE_CONFIG_LINK_SIZE
|
||||
</pre>
|
||||
@@ -323,13 +363,13 @@ documentation.
|
||||
<pre>
|
||||
PCRE_CONFIG_MATCH_LIMIT
|
||||
</pre>
|
||||
The output is an integer that gives the default limit for the number of
|
||||
The output is a long integer that gives the default limit for the number of
|
||||
internal matching function calls in a <b>pcre_exec()</b> execution. Further
|
||||
details are given with <b>pcre_exec()</b> below.
|
||||
<pre>
|
||||
PCRE_CONFIG_MATCH_LIMIT_RECURSION
|
||||
</pre>
|
||||
The output is an integer that gives the default limit for the depth of
|
||||
The output is a long integer that gives the default limit for the depth of
|
||||
recursion when calling the internal matching function in a <b>pcre_exec()</b>
|
||||
execution. Further details are given with <b>pcre_exec()</b> below.
|
||||
<pre>
|
||||
@@ -374,16 +414,17 @@ fully relocatable, because it may contain a copy of the <i>tableptr</i>
|
||||
argument, which is an address (see below).
|
||||
</P>
|
||||
<P>
|
||||
The <i>options</i> argument contains independent bits that affect the
|
||||
The <i>options</i> argument contains various bit settings that affect the
|
||||
compilation. It should be zero if no options are required. The available
|
||||
options are described below. Some of them, in particular, those that are
|
||||
compatible with Perl, can also be set and unset from within the pattern (see
|
||||
the detailed description in the
|
||||
options are described below. Some of them (in particular, those that are
|
||||
compatible with Perl, but also some others) can also be set and unset from
|
||||
within the pattern (see the detailed description in the
|
||||
<a href="pcrepattern.html"><b>pcrepattern</b></a>
|
||||
documentation). For these options, the contents of the <i>options</i> argument
|
||||
specifies their initial settings at the start of compilation and execution. The
|
||||
PCRE_ANCHORED and PCRE_NEWLINE_<i>xxx</i> options can be set at the time of
|
||||
matching as well as at compile time.
|
||||
documentation). For those options that can be different in different parts of
|
||||
the pattern, the contents of the <i>options</i> argument specifies their initial
|
||||
settings at the start of compilation and execution. The PCRE_ANCHORED and
|
||||
PCRE_NEWLINE_<i>xxx</i> options can be set at the time of matching as well as at
|
||||
compile time.
|
||||
</P>
|
||||
<P>
|
||||
If <i>errptr</i> is NULL, <b>pcre_compile()</b> returns NULL immediately.
|
||||
@@ -439,6 +480,15 @@ all with number 255, before each pattern item. For discussion of the callout
|
||||
facility, see the
|
||||
<a href="pcrecallout.html"><b>pcrecallout</b></a>
|
||||
documentation.
|
||||
<pre>
|
||||
PCRE_BSR_ANYCRLF
|
||||
PCRE_BSR_UNICODE
|
||||
</pre>
|
||||
These options (which are mutually exclusive) control what the \R escape
|
||||
sequence matches. The choice is either to match only CR, LF, or CRLF, or to
|
||||
match any Unicode newline sequence. The default is specified when PCRE is
|
||||
built. It can be overridden from within the pattern, or by setting an option
|
||||
when a compiled pattern is matched.
|
||||
<pre>
|
||||
PCRE_CASELESS
|
||||
</pre>
|
||||
@@ -467,8 +517,8 @@ If this bit is set, a dot metacharacter in the pattern matches all characters,
|
||||
including those that indicate newline. Without it, a dot does not match when
|
||||
the current position is at a newline. This option is equivalent to Perl's /s
|
||||
option, and it can be changed within a pattern by a (?s) option setting. A
|
||||
negative class such as [^a] always matches newlines, independent of the setting
|
||||
of this option.
|
||||
negative class such as [^a] always matches newline characters, independent of
|
||||
the setting of this option.
|
||||
<pre>
|
||||
PCRE_DUPNAMES
|
||||
</pre>
|
||||
@@ -510,6 +560,22 @@ this option. It can also be set by a (?X) option setting within a pattern.
|
||||
If this option is set, an unanchored pattern is required to match before or at
|
||||
the first newline in the subject string, though the matched text may continue
|
||||
over the newline.
|
||||
<pre>
|
||||
PCRE_JAVASCRIPT_COMPAT
|
||||
</pre>
|
||||
If this option is set, PCRE's behaviour is changed in some ways so that it is
|
||||
compatible with JavaScript rather than Perl. The changes are as follows:
|
||||
</P>
|
||||
<P>
|
||||
(1) A lone closing square bracket in a pattern causes a compile-time error,
|
||||
because this is illegal in JavaScript (by default it is treated as a data
|
||||
character). Thus, the pattern AB]CD becomes illegal when this option is set.
|
||||
</P>
|
||||
<P>
|
||||
(2) At run time, a back reference to an unset subpattern group matches an empty
|
||||
string (by default this causes the current matching alternative to fail). A
|
||||
pattern such as (\1)(a) succeeds when this option is set (assuming it can find
|
||||
an "a" in the subject), whereas it fails by default, for Perl compatibility.
|
||||
<pre>
|
||||
PCRE_MULTILINE
|
||||
</pre>
|
||||
@@ -531,19 +597,40 @@ occurrences of ^ or $ in a pattern, setting PCRE_MULTILINE has no effect.
|
||||
PCRE_NEWLINE_CR
|
||||
PCRE_NEWLINE_LF
|
||||
PCRE_NEWLINE_CRLF
|
||||
PCRE_NEWLINE_ANYCRLF
|
||||
PCRE_NEWLINE_ANY
|
||||
</pre>
|
||||
These options override the default newline definition that was chosen when PCRE
|
||||
was built. Setting the first or the second specifies that a newline is
|
||||
indicated by a single character (CR or LF, respectively). Setting both of them
|
||||
specifies that a newline is indicated by the two-character CRLF sequence. For
|
||||
convenience, PCRE_NEWLINE_CRLF is defined to contain both bits. The only time
|
||||
that a line break is relevant when compiling a pattern is if PCRE_EXTENDED is
|
||||
set, and an unescaped # outside a character class is encountered. This
|
||||
indicates a comment that lasts until after the next newline.
|
||||
indicated by a single character (CR or LF, respectively). Setting
|
||||
PCRE_NEWLINE_CRLF specifies that a newline is indicated by the two-character
|
||||
CRLF sequence. Setting PCRE_NEWLINE_ANYCRLF specifies that any of the three
|
||||
preceding sequences should be recognized. Setting PCRE_NEWLINE_ANY specifies
|
||||
that any Unicode newline sequence should be recognized. The Unicode newline
|
||||
sequences are the three just mentioned, plus the single characters VT (vertical
|
||||
tab, U+000B), FF (formfeed, U+000C), NEL (next line, U+0085), LS (line
|
||||
separator, U+2028), and PS (paragraph separator, U+2029). The last two are
|
||||
recognized only in UTF-8 mode.
|
||||
</P>
|
||||
<P>
|
||||
The newline option set at compile time becomes the default that is used for
|
||||
<b>pcre_exec()</b> and <b>pcre_dfa_exec()</b>, but it can be overridden.
|
||||
The newline setting in the options word uses three bits that are treated
|
||||
as a number, giving eight possibilities. Currently only six are used (default
|
||||
plus the five values above). This means that if you set more than one newline
|
||||
option, the combination may or may not be sensible. For example,
|
||||
PCRE_NEWLINE_CR with PCRE_NEWLINE_LF is equivalent to PCRE_NEWLINE_CRLF, but
|
||||
other combinations may yield unused numbers and cause an error.
|
||||
</P>
|
||||
<P>
|
||||
The only time that a line break is specially recognized when compiling a
|
||||
pattern is if PCRE_EXTENDED is set, and an unescaped # outside a character
|
||||
class is encountered. This indicates a comment that lasts until after the next
|
||||
line break sequence. In other circumstances, line break sequences are treated
|
||||
as literal data, except that in PCRE_EXTENDED mode, both CR and LF are treated
|
||||
as whitespace characters and are therefore ignored.
|
||||
</P>
|
||||
<P>
|
||||
The newline option that is set at compile time becomes the default that is used
|
||||
for <b>pcre_exec()</b> and <b>pcre_dfa_exec()</b>, but it can be overridden.
|
||||
<pre>
|
||||
PCRE_NO_AUTO_CAPTURE
|
||||
</pre>
|
||||
@@ -574,20 +661,24 @@ page.
|
||||
PCRE_NO_UTF8_CHECK
|
||||
</pre>
|
||||
When PCRE_UTF8 is set, the validity of the pattern as a UTF-8 string is
|
||||
automatically checked. If an invalid UTF-8 sequence of bytes is found,
|
||||
<b>pcre_compile()</b> returns an error. If you already know that your pattern is
|
||||
valid, and you want to skip this check for performance reasons, you can set the
|
||||
PCRE_NO_UTF8_CHECK option. When it is set, the effect of passing an invalid
|
||||
UTF-8 string as a pattern is undefined. It may cause your program to crash.
|
||||
Note that this option can also be passed to <b>pcre_exec()</b> and
|
||||
<b>pcre_dfa_exec()</b>, to suppress the UTF-8 validity checking of subject
|
||||
strings.
|
||||
automatically checked. There is a discussion about the
|
||||
<a href="pcre.html#utf8strings">validity of UTF-8 strings</a>
|
||||
in the main
|
||||
<a href="pcre.html"><b>pcre</b></a>
|
||||
page. If an invalid UTF-8 sequence of bytes is found, <b>pcre_compile()</b>
|
||||
returns an error. If you already know that your pattern is valid, and you want
|
||||
to skip this check for performance reasons, you can set the PCRE_NO_UTF8_CHECK
|
||||
option. When it is set, the effect of passing an invalid UTF-8 string as a
|
||||
pattern is undefined. It may cause your program to crash. Note that this option
|
||||
can also be passed to <b>pcre_exec()</b> and <b>pcre_dfa_exec()</b>, to suppress
|
||||
the UTF-8 validity checking of subject strings.
|
||||
</P>
|
||||
<br><a name="SEC8" href="#TOC1">COMPILATION ERROR CODES</a><br>
|
||||
<P>
|
||||
The following table lists the error codes that may be returned by
|
||||
<b>pcre_compile2()</b>, along with the error messages that may be returned by
|
||||
both compiling functions.
|
||||
both compiling functions. As PCRE has developed, some error codes have fallen
|
||||
out of use. To avoid confusion, they have not been re-used.
|
||||
<pre>
|
||||
0 no error
|
||||
1 \ at end of pattern
|
||||
@@ -599,17 +690,17 @@ both compiling functions.
|
||||
7 invalid escape sequence in character class
|
||||
8 range out of order in character class
|
||||
9 nothing to repeat
|
||||
10 operand of unlimited repeat could match the empty string
|
||||
10 [this code is not in use]
|
||||
11 internal error: unexpected repeat
|
||||
12 unrecognized character after (?
|
||||
12 unrecognized character after (? or (?-
|
||||
13 POSIX named classes are supported only within a class
|
||||
14 missing )
|
||||
15 reference to non-existent subpattern
|
||||
16 erroffset passed as NULL
|
||||
17 unknown option bit(s) set
|
||||
18 missing ) after comment
|
||||
19 parentheses nested too deeply
|
||||
20 regular expression too large
|
||||
19 [this code is not in use]
|
||||
20 regular expression is too large
|
||||
21 failed to get memory
|
||||
22 unmatched parentheses
|
||||
23 internal error: code overflow
|
||||
@@ -618,11 +709,11 @@ both compiling functions.
|
||||
26 malformed number or name after (?(
|
||||
27 conditional group contains more than two branches
|
||||
28 assertion expected after (?(
|
||||
29 (?R or (?digits must be followed by )
|
||||
29 (?R or (?[+-]digits must be followed by )
|
||||
30 unknown POSIX class name
|
||||
31 POSIX collating elements are not supported
|
||||
32 this version of PCRE is not compiled with PCRE_UTF8 support
|
||||
33 spare error
|
||||
33 [this code is not in use]
|
||||
34 character value in \x{...} sequence is too large
|
||||
35 invalid condition (?(0)
|
||||
36 \C not allowed in lookbehind assertion
|
||||
@@ -631,17 +722,33 @@ both compiling functions.
|
||||
39 closing ) for (?C expected
|
||||
40 recursive call could loop indefinitely
|
||||
41 unrecognized character after (?P
|
||||
42 syntax error after (?P
|
||||
42 syntax error in subpattern name (missing terminator)
|
||||
43 two named subpatterns have the same name
|
||||
44 invalid UTF-8 string
|
||||
45 support for \P, \p, and \X has not been compiled
|
||||
46 malformed \P or \p sequence
|
||||
47 unknown property name after \P or \p
|
||||
48 subpattern name is too long (maximum 32 characters)
|
||||
49 too many named subpatterns (maximum 10,000)
|
||||
50 repeated subpattern is too long
|
||||
49 too many named subpatterns (maximum 10000)
|
||||
50 [this code is not in use]
|
||||
51 octal value is greater than \377 (not in UTF-8 mode)
|
||||
</PRE>
|
||||
52 internal error: overran compiling workspace
|
||||
53 internal error: previously-checked referenced subpattern not found
|
||||
54 DEFINE group contains more than one branch
|
||||
55 repeating a DEFINE group is not allowed
|
||||
56 inconsistent NEWLINE options
|
||||
57 \g is not followed by a braced, angle-bracketed, or quoted
|
||||
name/number or by a plain number
|
||||
58 a numbered reference must not be zero
|
||||
59 (*VERB) with an argument is not supported
|
||||
60 (*VERB) not recognized
|
||||
61 number is too big
|
||||
62 subpattern name expected
|
||||
63 digit expected after (?+
|
||||
64 ] is an invalid data character in JavaScript compatibility mode
|
||||
</pre>
|
||||
The numbers 32 and 10000 in errors 48 and 49 are defaults; different values may
|
||||
be used if the limits were changed when PCRE was built.
|
||||
</P>
|
||||
<br><a name="SEC9" href="#TOC1">STUDYING A PATTERN</a><br>
|
||||
<P>
|
||||
@@ -698,20 +805,27 @@ bytes is created.
|
||||
<a name="localesupport"></a></P>
|
||||
<br><a name="SEC10" href="#TOC1">LOCALE SUPPORT</a><br>
|
||||
<P>
|
||||
PCRE handles caseless matching, and determines whether characters are letters
|
||||
PCRE handles caseless matching, and determines whether characters are letters,
|
||||
digits, or whatever, by reference to a set of tables, indexed by character
|
||||
value. When running in UTF-8 mode, this applies only to characters with codes
|
||||
less than 128. Higher-valued codes never match escapes such as \w or \d, but
|
||||
can be tested with \p if PCRE is built with Unicode character property
|
||||
support. The use of locales with Unicode is discouraged.
|
||||
support. The use of locales with Unicode is discouraged. If you are handling
|
||||
characters with codes greater than 128, you should either use UTF-8 and
|
||||
Unicode, or use locales, but not try to mix the two.
|
||||
</P>
|
||||
<P>
|
||||
An internal set of tables is created in the default C locale when PCRE is
|
||||
built. This is used when the final argument of <b>pcre_compile()</b> is NULL,
|
||||
and is sufficient for many applications. An alternative set of tables can,
|
||||
however, be supplied. These may be created in a different locale from the
|
||||
default. As more and more applications change to using Unicode, the need for
|
||||
this locale support is expected to die away.
|
||||
PCRE contains an internal set of tables that are used when the final argument
|
||||
of <b>pcre_compile()</b> is NULL. These are sufficient for many applications.
|
||||
Normally, the internal tables recognize only ASCII characters. However, when
|
||||
PCRE is built, it is possible to cause the internal tables to be rebuilt in the
|
||||
default "C" locale of the local system, which may cause them to be different.
|
||||
</P>
|
||||
<P>
|
||||
The internal tables can always be overridden by tables supplied by the
|
||||
application that calls PCRE. These may be created in a different locale from
|
||||
the default. As more and more applications change to using Unicode, the need
|
||||
for this locale support is expected to die away.
|
||||
</P>
|
||||
<P>
|
||||
External tables are built by calling the <b>pcre_maketables()</b> function,
|
||||
@@ -725,6 +839,10 @@ the following code could be used:
|
||||
tables = pcre_maketables();
|
||||
re = pcre_compile(..., tables);
|
||||
</pre>
|
||||
The locale name "fr_FR" is used on Linux and other Unix-like systems; if you
|
||||
are using Windows, the name for the French locale is "french".
|
||||
</P>
|
||||
<P>
|
||||
When <b>pcre_maketables()</b> runs, the tables are built in memory that is
|
||||
obtained via <b>pcre_malloc</b>. It is the caller's responsibility to ensure
|
||||
that the memory containing the tables remains available for as long as it is
|
||||
@@ -810,7 +928,7 @@ still recognized for backwards compatibility.)
|
||||
</P>
|
||||
<P>
|
||||
If there is a fixed first byte, for example, from a pattern such as
|
||||
(cat|cow|coyote). Otherwise, if either
|
||||
(cat|cow|coyote), its value is returned. Otherwise, if either
|
||||
<br>
|
||||
<br>
|
||||
(a) the pattern was compiled with the PCRE_MULTILINE option, and every branch
|
||||
@@ -831,6 +949,18 @@ If the pattern was studied, and this resulted in the construction of a 256-bit
|
||||
table indicating a fixed set of bytes for the first byte in any matching
|
||||
string, a pointer to the table is returned. Otherwise NULL is returned. The
|
||||
fourth argument should point to an <b>unsigned char *</b> variable.
|
||||
<pre>
|
||||
PCRE_INFO_HASCRORLF
|
||||
</pre>
|
||||
Return 1 if the pattern contains any explicit matches for CR or LF characters,
|
||||
otherwise 0. The fourth argument should point to an <b>int</b> variable. An
|
||||
explicit match is either a literal CR or LF character, or \r or \n.
|
||||
<pre>
|
||||
PCRE_INFO_JCHANGED
|
||||
</pre>
|
||||
Return 1 if the (?J) or (?-J) option setting is used in the pattern, otherwise
|
||||
0. The fourth argument should point to an <b>int</b> variable. (?J) and
|
||||
(?-J) set and unset the local PCRE_DUPNAMES option, respectively.
|
||||
<pre>
|
||||
PCRE_INFO_LASTLITERAL
|
||||
</pre>
|
||||
@@ -868,7 +998,7 @@ alphabetical order. When PCRE_DUPNAMES is set, duplicate names are in order of
|
||||
their parentheses numbers. For example, consider the following pattern (assume
|
||||
PCRE_EXTENDED is set, so white space - including newlines - is ignored):
|
||||
<pre>
|
||||
(?P<date> (?P<year>(\d\d)?\d\d) - (?P<month>\d\d) - (?P<day>\d\d) )
|
||||
(?<date> (?<year>(\d\d)?\d\d) - (?<month>\d\d) - (?<day>\d\d) )
|
||||
</pre>
|
||||
There are four named subpatterns, so the table has four entries, and each entry
|
||||
in the table is eight bytes long. The table is as follows, with non-printing
|
||||
@@ -882,13 +1012,24 @@ bytes shown in hexadecimal, and undefined bytes shown as ??:
|
||||
When writing code to extract data from named subpatterns using the
|
||||
name-to-number map, remember that the length of the entries is likely to be
|
||||
different for each compiled pattern.
|
||||
<pre>
|
||||
PCRE_INFO_OKPARTIAL
|
||||
</pre>
|
||||
Return 1 if the pattern can be used for partial matching, otherwise 0. The
|
||||
fourth argument should point to an <b>int</b> variable. The
|
||||
<a href="pcrepartial.html"><b>pcrepartial</b></a>
|
||||
documentation lists the restrictions that apply to patterns when partial
|
||||
matching is used.
|
||||
<pre>
|
||||
PCRE_INFO_OPTIONS
|
||||
</pre>
|
||||
Return a copy of the options with which the pattern was compiled. The fourth
|
||||
argument should point to an <b>unsigned long int</b> variable. These option bits
|
||||
are those specified in the call to <b>pcre_compile()</b>, modified by any
|
||||
top-level option settings within the pattern itself.
|
||||
top-level option settings at the start of the pattern itself. In other words,
|
||||
they are the options that will be in force when matching starts. For example,
|
||||
if the pattern /(?im)abc(?-i)d/ is compiled with the PCRE_EXTENDED option, the
|
||||
result is PCRE_CASELESS, PCRE_MULTILINE, and PCRE_EXTENDED.
|
||||
</P>
|
||||
<P>
|
||||
A pattern is automatically anchored by PCRE if all of its top-level
|
||||
@@ -1097,14 +1238,15 @@ the external tables might be at a different address when <b>pcre_exec()</b> is
|
||||
called. See the
|
||||
<a href="pcreprecompile.html"><b>pcreprecompile</b></a>
|
||||
documentation for a discussion of saving compiled patterns for later use.
|
||||
</P>
|
||||
<a name="execoptions"></a></P>
|
||||
<br><b>
|
||||
Option bits for <b>pcre_exec()</b>
|
||||
</b><br>
|
||||
<P>
|
||||
The unused bits of the <i>options</i> argument for <b>pcre_exec()</b> must be
|
||||
zero. The only bits that may be set are PCRE_ANCHORED, PCRE_NEWLINE_<i>xxx</i>,
|
||||
PCRE_NOTBOL, PCRE_NOTEOL, PCRE_NOTEMPTY, PCRE_NO_UTF8_CHECK and PCRE_PARTIAL.
|
||||
PCRE_NOTBOL, PCRE_NOTEOL, PCRE_NOTEMPTY, PCRE_NO_START_OPTIMIZE,
|
||||
PCRE_NO_UTF8_CHECK and PCRE_PARTIAL.
|
||||
<pre>
|
||||
PCRE_ANCHORED
|
||||
</pre>
|
||||
@@ -1112,15 +1254,52 @@ The PCRE_ANCHORED option limits <b>pcre_exec()</b> to matching at the first
|
||||
matching position. If a pattern was compiled with PCRE_ANCHORED, or turned out
|
||||
to be anchored by virtue of its contents, it cannot be made unanchored at
|
||||
matching time.
|
||||
<pre>
|
||||
PCRE_BSR_ANYCRLF
|
||||
PCRE_BSR_UNICODE
|
||||
</pre>
|
||||
These options (which are mutually exclusive) control what the \R escape
|
||||
sequence matches. The choice is either to match only CR, LF, or CRLF, or to
|
||||
match any Unicode newline sequence. These options override the choice that was
|
||||
made or defaulted when the pattern was compiled.
|
||||
<pre>
|
||||
PCRE_NEWLINE_CR
|
||||
PCRE_NEWLINE_LF
|
||||
PCRE_NEWLINE_CRLF
|
||||
PCRE_NEWLINE_ANYCRLF
|
||||
PCRE_NEWLINE_ANY
|
||||
</pre>
|
||||
These options override the newline definition that was chosen or defaulted when
|
||||
the pattern was compiled. For details, see the description <b>pcre_compile()</b>
|
||||
above. During matching, the newline choice affects the behaviour of the dot,
|
||||
circumflex, and dollar metacharacters.
|
||||
the pattern was compiled. For details, see the description of
|
||||
<b>pcre_compile()</b> above. During matching, the newline choice affects the
|
||||
behaviour of the dot, circumflex, and dollar metacharacters. It may also alter
|
||||
the way the match position is advanced after a match failure for an unanchored
|
||||
pattern.
|
||||
</P>
|
||||
<P>
|
||||
When PCRE_NEWLINE_CRLF, PCRE_NEWLINE_ANYCRLF, or PCRE_NEWLINE_ANY is set, and a
|
||||
match attempt for an unanchored pattern fails when the current position is at a
|
||||
CRLF sequence, and the pattern contains no explicit matches for CR or LF
|
||||
characters, the match position is advanced by two characters instead of one, in
|
||||
other words, to after the CRLF.
|
||||
</P>
|
||||
<P>
|
||||
The above rule is a compromise that makes the most common cases work as
|
||||
expected. For example, if the pattern is .+A (and the PCRE_DOTALL option is not
|
||||
set), it does not match the string "\r\nA" because, after failing at the
|
||||
start, it skips both the CR and the LF before retrying. However, the pattern
|
||||
[\r\n]A does match that string, because it contains an explicit CR or LF
|
||||
reference, and so advances only by one character after the first failure.
|
||||
</P>
|
||||
<P>
|
||||
An explicit match for CR or LF is either a literal appearance of one of those
|
||||
characters, or one of the \r or \n escape sequences. Implicit matches such as
|
||||
[^X] do not count, nor does \s (which includes CR and LF in the characters
|
||||
that it matches).
|
||||
</P>
|
||||
<P>
|
||||
Notwithstanding the above, anomalous effects may still occur when CRLF is a
|
||||
valid newline sequence and explicit \r or \n escapes appear in the pattern.
|
||||
<pre>
|
||||
PCRE_NOTBOL
|
||||
</pre>
|
||||
@@ -1158,15 +1337,30 @@ matching a null string by first trying the match again at the same offset with
|
||||
PCRE_NOTEMPTY and PCRE_ANCHORED, and then if that fails by advancing the
|
||||
starting offset (see below) and trying an ordinary match again. There is some
|
||||
code that demonstrates how to do this in the <i>pcredemo.c</i> sample program.
|
||||
<pre>
|
||||
PCRE_NO_START_OPTIMIZE
|
||||
</pre>
|
||||
There are a number of optimizations that <b>pcre_exec()</b> uses at the start of
|
||||
a match, in order to speed up the process. For example, if it is known that a
|
||||
match must start with a specific character, it searches the subject for that
|
||||
character, and fails immediately if it cannot find it, without actually running
|
||||
the main matching function. When callouts are in use, these optimizations can
|
||||
cause them to be skipped. This option disables the "start-up" optimizations,
|
||||
causing performance to suffer, but ensuring that the callouts do occur.
|
||||
<pre>
|
||||
PCRE_NO_UTF8_CHECK
|
||||
</pre>
|
||||
When PCRE_UTF8 is set at compile time, the validity of the subject as a UTF-8
|
||||
string is automatically checked when <b>pcre_exec()</b> is subsequently called.
|
||||
The value of <i>startoffset</i> is also checked to ensure that it points to the
|
||||
start of a UTF-8 character. If an invalid UTF-8 sequence of bytes is found,
|
||||
<b>pcre_exec()</b> returns the error PCRE_ERROR_BADUTF8. If <i>startoffset</i>
|
||||
contains an invalid value, PCRE_ERROR_BADUTF8_OFFSET is returned.
|
||||
start of a UTF-8 character. There is a discussion about the validity of UTF-8
|
||||
strings in the
|
||||
<a href="pcre.html#utf8strings">section on UTF-8 support</a>
|
||||
in the main
|
||||
<a href="pcre.html"><b>pcre</b></a>
|
||||
page. If an invalid UTF-8 sequence of bytes is found, <b>pcre_exec()</b> returns
|
||||
the error PCRE_ERROR_BADUTF8. If <i>startoffset</i> contains an invalid value,
|
||||
PCRE_ERROR_BADUTF8_OFFSET is returned.
|
||||
</P>
|
||||
<P>
|
||||
If you already know that your subject is valid, and you want to skip these
|
||||
@@ -1196,11 +1390,11 @@ The string to be matched by <b>pcre_exec()</b>
|
||||
</b><br>
|
||||
<P>
|
||||
The subject string is passed to <b>pcre_exec()</b> as a pointer in
|
||||
<i>subject</i>, a length in <i>length</i>, and a starting byte offset in
|
||||
<i>startoffset</i>. In UTF-8 mode, the byte offset must point to the start of a
|
||||
UTF-8 character. Unlike the pattern string, the subject may contain binary zero
|
||||
bytes. When the starting offset is zero, the search for a match starts at the
|
||||
beginning of the subject, and this is by far the most common case.
|
||||
<i>subject</i>, a length (in bytes) in <i>length</i>, and a starting byte offset
|
||||
in <i>startoffset</i>. In UTF-8 mode, the byte offset must point to the start of
|
||||
a UTF-8 character. Unlike the pattern string, the subject may contain binary
|
||||
zero bytes. When the starting offset is zero, the search for a match starts at
|
||||
the beginning of the subject, and this is by far the most common case.
|
||||
</P>
|
||||
<P>
|
||||
A non-zero starting offset is useful when searching for another match in the
|
||||
@@ -1238,32 +1432,36 @@ a fragment of a pattern that picks out a substring. PCRE supports several other
|
||||
kinds of parenthesized subpattern that do not cause substrings to be captured.
|
||||
</P>
|
||||
<P>
|
||||
Captured substrings are returned to the caller via a vector of integer offsets
|
||||
whose address is passed in <i>ovector</i>. The number of elements in the vector
|
||||
is passed in <i>ovecsize</i>, which must be a non-negative number. <b>Note</b>:
|
||||
this argument is NOT the size of <i>ovector</i> in bytes.
|
||||
Captured substrings are returned to the caller via a vector of integers whose
|
||||
address is passed in <i>ovector</i>. The number of elements in the vector is
|
||||
passed in <i>ovecsize</i>, which must be a non-negative number. <b>Note</b>: this
|
||||
argument is NOT the size of <i>ovector</i> in bytes.
|
||||
</P>
|
||||
<P>
|
||||
The first two-thirds of the vector is used to pass back captured substrings,
|
||||
each substring using a pair of integers. The remaining third of the vector is
|
||||
used as workspace by <b>pcre_exec()</b> while matching capturing subpatterns,
|
||||
and is not available for passing back information. The length passed in
|
||||
and is not available for passing back information. The number passed in
|
||||
<i>ovecsize</i> should always be a multiple of three. If it is not, it is
|
||||
rounded down.
|
||||
</P>
|
||||
<P>
|
||||
When a match is successful, information about captured substrings is returned
|
||||
in pairs of integers, starting at the beginning of <i>ovector</i>, and
|
||||
continuing up to two-thirds of its length at the most. The first element of a
|
||||
pair is set to the offset of the first character in a substring, and the second
|
||||
is set to the offset of the first character after the end of a substring. The
|
||||
first pair, <i>ovector[0]</i> and <i>ovector[1]</i>, identify the portion of the
|
||||
subject string matched by the entire pattern. The next pair is used for the
|
||||
first capturing subpattern, and so on. The value returned by <b>pcre_exec()</b>
|
||||
is one more than the highest numbered pair that has been set. For example, if
|
||||
two substrings have been captured, the returned value is 3. If there are no
|
||||
capturing subpatterns, the return value from a successful match is 1,
|
||||
indicating that just the first pair of offsets has been set.
|
||||
continuing up to two-thirds of its length at the most. The first element of
|
||||
each pair is set to the byte offset of the first character in a substring, and
|
||||
the second is set to the byte offset of the first character after the end of a
|
||||
substring. <b>Note</b>: these values are always byte offsets, even in UTF-8
|
||||
mode. They are not character counts.
|
||||
</P>
|
||||
<P>
|
||||
The first pair of integers, <i>ovector[0]</i> and <i>ovector[1]</i>, identify the
|
||||
portion of the subject string matched by the entire pattern. The next pair is
|
||||
used for the first capturing subpattern, and so on. The value returned by
|
||||
<b>pcre_exec()</b> is one more than the highest numbered pair that has been set.
|
||||
For example, if two substrings have been captured, the returned value is 3. If
|
||||
there are no capturing subpatterns, the return value from a successful match is
|
||||
1, indicating that just the first pair of offsets has been set.
|
||||
</P>
|
||||
<P>
|
||||
If a capturing subpattern is matched repeatedly, it is the last portion of the
|
||||
@@ -1272,8 +1470,8 @@ string that it matched that is returned.
|
||||
<P>
|
||||
If the vector is too small to hold all the captured substring offsets, it is
|
||||
used as far as possible (up to two-thirds of its length), and the function
|
||||
returns a value of zero. In particular, if the substring offsets are not of
|
||||
interest, <b>pcre_exec()</b> may be called with <i>ovector</i> passed as NULL and
|
||||
returns a value of zero. If the substring offsets are not of interest,
|
||||
<b>pcre_exec()</b> may be called with <i>ovector</i> passed as NULL and
|
||||
<i>ovecsize</i> as zero. However, if the pattern contains back references and
|
||||
the <i>ovector</i> is not big enough to remember the related substrings, PCRE
|
||||
has to get additional memory for use during matching. Thus it is usually
|
||||
@@ -1334,7 +1532,7 @@ compiled in an environment of one endianness is run in an environment with the
|
||||
other endianness. This is the error that PCRE gives when the magic number is
|
||||
not present.
|
||||
<pre>
|
||||
PCRE_ERROR_UNKNOWN_NODE (-5)
|
||||
PCRE_ERROR_UNKNOWN_OPCODE (-5)
|
||||
</pre>
|
||||
While running the pattern match, an unknown item was encountered in the
|
||||
compiled pattern. This error could be caused by a bug in PCRE or by overwriting
|
||||
@@ -1359,12 +1557,6 @@ below). It is never returned by <b>pcre_exec()</b>.
|
||||
The backtracking limit, as specified by the <i>match_limit</i> field in a
|
||||
<b>pcre_extra</b> structure (or defaulted) was reached. See the description
|
||||
above.
|
||||
<pre>
|
||||
PCRE_ERROR_RECURSIONLIMIT (-21)
|
||||
</pre>
|
||||
The internal recursion limit, as specified by the <i>match_limit_recursion</i>
|
||||
field in a <b>pcre_extra</b> structure (or defaulted) was reached. See the
|
||||
description above.
|
||||
<pre>
|
||||
PCRE_ERROR_CALLOUT (-9)
|
||||
</pre>
|
||||
@@ -1403,6 +1595,19 @@ in PCRE or by overwriting of the compiled pattern.
|
||||
PCRE_ERROR_BADCOUNT (-15)
|
||||
</pre>
|
||||
This error is given if the value of the <i>ovecsize</i> argument is negative.
|
||||
<pre>
|
||||
PCRE_ERROR_RECURSIONLIMIT (-21)
|
||||
</pre>
|
||||
The internal recursion limit, as specified by the <i>match_limit_recursion</i>
|
||||
field in a <b>pcre_extra</b> structure (or defaulted) was reached. See the
|
||||
description above.
|
||||
<pre>
|
||||
PCRE_ERROR_BADNEWLINE (-23)
|
||||
</pre>
|
||||
An invalid combination of PCRE_NEWLINE_<i>xxx</i> options was given.
|
||||
</P>
|
||||
<P>
|
||||
Error numbers -16 to -20 and -22 are not used by <b>pcre_exec()</b>.
|
||||
</P>
|
||||
<br><a name="SEC15" href="#TOC1">EXTRACTING CAPTURED SUBSTRINGS BY NUMBER</a><br>
|
||||
<P>
|
||||
@@ -1457,7 +1662,7 @@ the string is placed in <i>buffer</i>, whose length is given by
|
||||
<i>buffersize</i>, while for <b>pcre_get_substring()</b> a new block of memory is
|
||||
obtained via <b>pcre_malloc</b>, and its address is returned via
|
||||
<i>stringptr</i>. The yield of the function is the length of the string, not
|
||||
including the terminating zero, or one of
|
||||
including the terminating zero, or one of these error codes:
|
||||
<pre>
|
||||
PCRE_ERROR_NOMEMORY (-6)
|
||||
</pre>
|
||||
@@ -1474,7 +1679,7 @@ and builds a list of pointers to them. All this is done in a single block of
|
||||
memory that is obtained via <b>pcre_malloc</b>. The address of the memory block
|
||||
is returned via <i>listptr</i>, which is also the start of the list of string
|
||||
pointers. The end of the list is marked by a NULL pointer. The yield of the
|
||||
function is zero if all went well, or
|
||||
function is zero if all went well, or the error code
|
||||
<pre>
|
||||
PCRE_ERROR_NOMEMORY (-6)
|
||||
</pre>
|
||||
@@ -1520,7 +1725,7 @@ provided.
|
||||
To extract a substring by name, you first have to find the associated number.
|
||||
For example, for this pattern
|
||||
<pre>
|
||||
(a+)b(?P<xxx>\d+)...
|
||||
(a+)b(?<xxx>\d+)...
|
||||
</pre>
|
||||
the number of the subpattern called "xxx" is 2. If the name is known to be
|
||||
unique (PCRE_DUPNAMES was not set), you can find the number from the name by
|
||||
@@ -1548,8 +1753,15 @@ translation table.
|
||||
</P>
|
||||
<P>
|
||||
These functions call <b>pcre_get_stringnumber()</b>, and if it succeeds, they
|
||||
then call <i>pcre_copy_substring()</i> or <i>pcre_get_substring()</i>, as
|
||||
appropriate.
|
||||
then call <b>pcre_copy_substring()</b> or <b>pcre_get_substring()</b>, as
|
||||
appropriate. <b>NOTE:</b> If PCRE_DUPNAMES is set and there are duplicate names,
|
||||
the behaviour may not be what you want (see the next section).
|
||||
</P>
|
||||
<P>
|
||||
<b>Warning:</b> If the pattern uses the "(?|" feature to set up multiple
|
||||
subpatterns with the same number, you cannot use names to distinguish them,
|
||||
because names are not included in the compiled code. The matching process uses
|
||||
only numbers.
|
||||
</P>
|
||||
<br><a name="SEC17" href="#TOC1">DUPLICATE SUBPATTERN NAMES</a><br>
|
||||
<P>
|
||||
@@ -1562,23 +1774,27 @@ are not required to be unique. Normally, patterns with duplicate names are such
|
||||
that in any one match, only one of the named subpatterns participates. An
|
||||
example is shown in the
|
||||
<a href="pcrepattern.html"><b>pcrepattern</b></a>
|
||||
documentation. When duplicates are present, <b>pcre_copy_named_substring()</b>
|
||||
and <b>pcre_get_named_substring()</b> return the first substring corresponding
|
||||
to the given name that is set. If none are set, an empty string is returned.
|
||||
The <b>pcre_get_stringnumber()</b> function returns one of the numbers that are
|
||||
associated with the name, but it is not defined which it is.
|
||||
<br>
|
||||
<br>
|
||||
documentation.
|
||||
</P>
|
||||
<P>
|
||||
When duplicates are present, <b>pcre_copy_named_substring()</b> and
|
||||
<b>pcre_get_named_substring()</b> return the first substring corresponding to
|
||||
the given name that is set. If none are set, PCRE_ERROR_NOSUBSTRING (-7) is
|
||||
returned; no data is returned. The <b>pcre_get_stringnumber()</b> function
|
||||
returns one of the numbers that are associated with the name, but it is not
|
||||
defined which it is.
|
||||
</P>
|
||||
<P>
|
||||
If you want to get full details of all captured substrings for a given name,
|
||||
you must use the <b>pcre_get_stringtable_entries()</b> function. The first
|
||||
argument is the compiled pattern, and the second is the name. The third and
|
||||
fourth are pointers to variables which are updated by the function. After it
|
||||
has run, they point to the first and last entries in the name-to-number table
|
||||
for the given name. The function itself returns the length of each entry, or
|
||||
PCRE_ERROR_NOSUBSTRING if there are none. The format of the table is described
|
||||
above in the section entitled <i>Information about a pattern</i>. Given all the
|
||||
relevant entries for the name, you can extract each of their numbers, and hence
|
||||
the captured data, if any.
|
||||
PCRE_ERROR_NOSUBSTRING (-7) if there are none. The format of the table is
|
||||
described above in the section entitled <i>Information about a pattern</i>.
|
||||
Given all the relevant entries for the name, you can extract each of their
|
||||
numbers, and hence the captured data, if any.
|
||||
</P>
|
||||
<br><a name="SEC18" href="#TOC1">FINDING ALL POSSIBLE MATCHES</a><br>
|
||||
<P>
|
||||
@@ -1608,11 +1824,12 @@ will yield PCRE_ERROR_NOMATCH.
|
||||
</P>
|
||||
<P>
|
||||
The function <b>pcre_dfa_exec()</b> is called to match a subject string against
|
||||
a compiled pattern, using a "DFA" matching algorithm. This has different
|
||||
characteristics to the normal algorithm, and is not compatible with Perl. Some
|
||||
of the features of PCRE patterns are not supported. Nevertheless, there are
|
||||
times when this kind of matching can be useful. For a discussion of the two
|
||||
matching algorithms, see the
|
||||
a compiled pattern, using a matching algorithm that scans the subject string
|
||||
just once, and does not backtrack. This has different characteristics to the
|
||||
normal algorithm, and is not compatible with Perl. Some of the features of PCRE
|
||||
patterns are not supported. Nevertheless, there are times when this kind of
|
||||
matching can be useful. For a discussion of the two matching algorithms, see
|
||||
the
|
||||
<a href="pcrematching.html"><b>pcrematching</b></a>
|
||||
documentation.
|
||||
</P>
|
||||
@@ -1671,9 +1888,9 @@ matching string.
|
||||
PCRE_DFA_SHORTEST
|
||||
</pre>
|
||||
Setting the PCRE_DFA_SHORTEST option causes the matching algorithm to stop as
|
||||
soon as it has found one match. Because of the way the DFA algorithm works,
|
||||
this is necessarily the shortest possible match at the first possible matching
|
||||
point in the subject string.
|
||||
soon as it has found one match. Because of the way the alternative algorithm
|
||||
works, this is necessarily the shortest possible match at the first possible
|
||||
matching point in the subject string.
|
||||
<pre>
|
||||
PCRE_DFA_RESTART
|
||||
</pre>
|
||||
@@ -1711,10 +1928,10 @@ the three matched strings are
|
||||
On success, the yield of the function is a number greater than zero, which is
|
||||
the number of matched substrings. The substrings themselves are returned in
|
||||
<i>ovector</i>. Each string uses two elements; the first is the offset to the
|
||||
start, and the second is the offset to the end. All the strings have the same
|
||||
start offset. (Space could have been saved by giving this only once, but it was
|
||||
decided to retain some compatibility with the way <b>pcre_exec()</b> returns
|
||||
data, even though the meaning of the strings is different.)
|
||||
start, and the second is the offset to the end. In fact, all the strings have
|
||||
the same start offset. (Space could have been saved by giving this only once,
|
||||
but it was decided to retain some compatibility with the way <b>pcre_exec()</b>
|
||||
returns data, even though the meaning of the strings is different.)
|
||||
</P>
|
||||
<P>
|
||||
The strings are returned in reverse order of length; that is, the longest
|
||||
@@ -1740,8 +1957,9 @@ that it does not support, for instance, the use of \C or a back reference.
|
||||
<pre>
|
||||
PCRE_ERROR_DFA_UCOND (-17)
|
||||
</pre>
|
||||
This return is given if <b>pcre_dfa_exec()</b> encounters a condition item in a
|
||||
pattern that uses a back reference for the condition. This is not supported.
|
||||
This return is given if <b>pcre_dfa_exec()</b> encounters a condition item that
|
||||
uses a back reference for the condition, or a test for recursion in a specific
|
||||
group. These are not supported.
|
||||
<pre>
|
||||
PCRE_ERROR_DFA_UMLIMIT (-18)
|
||||
</pre>
|
||||
@@ -1761,10 +1979,27 @@ recursively, using private vectors for <i>ovector</i> and <i>workspace</i>. This
|
||||
error is given if the output vector is not large enough. This should be
|
||||
extremely rare, as a vector of size 1000 is used.
|
||||
</P>
|
||||
<br><a name="SEC20" href="#TOC1">SEE ALSO</a><br>
|
||||
<P>
|
||||
Last updated: 08 June 2006
|
||||
<b>pcrebuild</b>(3), <b>pcrecallout</b>(3), <b>pcrecpp</b>(3),
|
||||
<b>pcrematching</b>(3), <b>pcrepartial</b>(3), <b>pcreposix</b>(3),
|
||||
<b>pcreprecompile</b>(3), <b>pcresample</b>(3), <b>pcrestack</b>(3).
|
||||
</P>
|
||||
<br><a name="SEC21" href="#TOC1">AUTHOR</a><br>
|
||||
<P>
|
||||
Philip Hazel
|
||||
<br>
|
||||
University Computing Service
|
||||
<br>
|
||||
Cambridge CB2 3QH, England.
|
||||
<br>
|
||||
</P>
|
||||
<br><a name="SEC22" href="#TOC1">REVISION</a><br>
|
||||
<P>
|
||||
Last updated: 11 April 2009
|
||||
<br>
|
||||
Copyright © 1997-2009 University of Cambridge.
|
||||
<br>
|
||||
Copyright © 1997-2006 University of Cambridge.
|
||||
<p>
|
||||
Return to the <a href="index.html">PCRE index page</a>.
|
||||
</p>
|
||||
|
||||
@@ -18,26 +18,39 @@ man page, in case the conversion went wrong.
|
||||
<li><a name="TOC3" href="#SEC3">UTF-8 SUPPORT</a>
|
||||
<li><a name="TOC4" href="#SEC4">UNICODE CHARACTER PROPERTY SUPPORT</a>
|
||||
<li><a name="TOC5" href="#SEC5">CODE VALUE OF NEWLINE</a>
|
||||
<li><a name="TOC6" href="#SEC6">BUILDING SHARED AND STATIC LIBRARIES</a>
|
||||
<li><a name="TOC7" href="#SEC7">POSIX MALLOC USAGE</a>
|
||||
<li><a name="TOC8" href="#SEC8">HANDLING VERY LARGE PATTERNS</a>
|
||||
<li><a name="TOC9" href="#SEC9">AVOIDING EXCESSIVE STACK USAGE</a>
|
||||
<li><a name="TOC10" href="#SEC10">LIMITING PCRE RESOURCE USAGE</a>
|
||||
<li><a name="TOC11" href="#SEC11">USING EBCDIC CODE</a>
|
||||
<li><a name="TOC6" href="#SEC6">WHAT \R MATCHES</a>
|
||||
<li><a name="TOC7" href="#SEC7">BUILDING SHARED AND STATIC LIBRARIES</a>
|
||||
<li><a name="TOC8" href="#SEC8">POSIX MALLOC USAGE</a>
|
||||
<li><a name="TOC9" href="#SEC9">HANDLING VERY LARGE PATTERNS</a>
|
||||
<li><a name="TOC10" href="#SEC10">AVOIDING EXCESSIVE STACK USAGE</a>
|
||||
<li><a name="TOC11" href="#SEC11">LIMITING PCRE RESOURCE USAGE</a>
|
||||
<li><a name="TOC12" href="#SEC12">CREATING CHARACTER TABLES AT BUILD TIME</a>
|
||||
<li><a name="TOC13" href="#SEC13">USING EBCDIC CODE</a>
|
||||
<li><a name="TOC14" href="#SEC14">PCREGREP OPTIONS FOR COMPRESSED FILE SUPPORT</a>
|
||||
<li><a name="TOC15" href="#SEC15">PCRETEST OPTION FOR LIBREADLINE SUPPORT</a>
|
||||
<li><a name="TOC16" href="#SEC16">SEE ALSO</a>
|
||||
<li><a name="TOC17" href="#SEC17">AUTHOR</a>
|
||||
<li><a name="TOC18" href="#SEC18">REVISION</a>
|
||||
</ul>
|
||||
<br><a name="SEC1" href="#TOC1">PCRE BUILD-TIME OPTIONS</a><br>
|
||||
<P>
|
||||
This document describes the optional features of PCRE that can be selected when
|
||||
the library is compiled. They are all selected, or deselected, by providing
|
||||
options to the <b>configure</b> script that is run before the <b>make</b>
|
||||
command. The complete list of options for <b>configure</b> (which includes the
|
||||
standard ones such as the selection of the installation directory) can be
|
||||
obtained by running
|
||||
the library is compiled. It assumes use of the <b>configure</b> script, where
|
||||
the optional features are selected or deselected by providing options to
|
||||
<b>configure</b> before running the <b>make</b> command. However, the same
|
||||
options can be selected in both Unix-like and non-Unix-like environments using
|
||||
the GUI facility of <b>CMakeSetup</b> if you are using <b>CMake</b> instead of
|
||||
<b>configure</b> to build PCRE.
|
||||
</P>
|
||||
<P>
|
||||
The complete list of options for <b>configure</b> (which includes the standard
|
||||
ones such as the selection of the installation directory) can be obtained by
|
||||
running
|
||||
<pre>
|
||||
./configure --help
|
||||
</pre>
|
||||
The following sections describe certain options whose names begin with --enable
|
||||
or --disable. These settings specify changes to the defaults for the
|
||||
The following sections include descriptions of options whose names begin with
|
||||
--enable or --disable. These settings specify changes to the defaults for the
|
||||
<b>configure</b> command. Because of the way that <b>configure</b> works,
|
||||
--enable and --disable always come in pairs, so the complementary option always
|
||||
exists as well, but as it specifies the default, it is not described.
|
||||
@@ -54,7 +67,7 @@ to the <b>configure</b> command.
|
||||
</P>
|
||||
<br><a name="SEC3" href="#TOC1">UTF-8 SUPPORT</a><br>
|
||||
<P>
|
||||
To build PCRE with support for UTF-8 character strings, add
|
||||
To build PCRE with support for UTF-8 Unicode character strings, add
|
||||
<pre>
|
||||
--enable-utf8
|
||||
</pre>
|
||||
@@ -63,6 +76,13 @@ strings as UTF-8. As well as compiling PCRE with this option, you also have
|
||||
have to set the PCRE_UTF8 option when you call the <b>pcre_compile()</b>
|
||||
function.
|
||||
</P>
|
||||
<P>
|
||||
If you set --enable-utf8 when compiling in an EBCDIC environment, PCRE expects
|
||||
its input to be either ASCII or UTF-8 (depending on the runtime option). It is
|
||||
not possible to support both EBCDIC and UTF-8 codes in the same version of the
|
||||
library. Consequently, --enable-utf8 and --enable-ebcdic are mutually
|
||||
exclusive.
|
||||
</P>
|
||||
<br><a name="SEC4" href="#TOC1">UNICODE CHARACTER PROPERTY SUPPORT</a><br>
|
||||
<P>
|
||||
UTF-8 support allows PCRE to process character values greater than 255 in the
|
||||
@@ -77,17 +97,17 @@ to the <b>configure</b> command. This implies UTF-8 support, even if you have
|
||||
not explicitly requested it.
|
||||
</P>
|
||||
<P>
|
||||
Including Unicode property support adds around 90K of tables to the PCRE
|
||||
library, approximately doubling its size. Only the general category properties
|
||||
such as <i>Lu</i> and <i>Nd</i> are supported. Details are given in the
|
||||
Including Unicode property support adds around 30K of tables to the PCRE
|
||||
library. Only the general category properties such as <i>Lu</i> and <i>Nd</i> are
|
||||
supported. Details are given in the
|
||||
<a href="pcrepattern.html"><b>pcrepattern</b></a>
|
||||
documentation.
|
||||
</P>
|
||||
<br><a name="SEC5" href="#TOC1">CODE VALUE OF NEWLINE</a><br>
|
||||
<P>
|
||||
By default, PCRE interprets character 10 (linefeed, LF) as indicating the end
|
||||
By default, PCRE interprets the linefeed (LF) character as indicating the end
|
||||
of a line. This is the normal newline character on Unix-like systems. You can
|
||||
compile PCRE to use character 13 (carriage return, CR) instead, by adding
|
||||
compile PCRE to use carriage return (CR) instead, by adding
|
||||
<pre>
|
||||
--enable-newline-is-cr
|
||||
</pre>
|
||||
@@ -100,11 +120,34 @@ character sequence CRLF. If you want this, add
|
||||
<pre>
|
||||
--enable-newline-is-crlf
|
||||
</pre>
|
||||
to the <b>configure</b> command. Whatever line ending convention is selected
|
||||
when PCRE is built can be overridden when the library functions are called. At
|
||||
build time it is conventional to use the standard for your operating system.
|
||||
to the <b>configure</b> command. There is a fourth option, specified by
|
||||
<pre>
|
||||
--enable-newline-is-anycrlf
|
||||
</pre>
|
||||
which causes PCRE to recognize any of the three sequences CR, LF, or CRLF as
|
||||
indicating a line ending. Finally, a fifth option, specified by
|
||||
<pre>
|
||||
--enable-newline-is-any
|
||||
</pre>
|
||||
causes PCRE to recognize any Unicode newline sequence.
|
||||
</P>
|
||||
<br><a name="SEC6" href="#TOC1">BUILDING SHARED AND STATIC LIBRARIES</a><br>
|
||||
<P>
|
||||
Whatever line ending convention is selected when PCRE is built can be
|
||||
overridden when the library functions are called. At build time it is
|
||||
conventional to use the standard for your operating system.
|
||||
</P>
|
||||
<br><a name="SEC6" href="#TOC1">WHAT \R MATCHES</a><br>
|
||||
<P>
|
||||
By default, the sequence \R in a pattern matches any Unicode newline sequence,
|
||||
whatever has been selected as the line ending sequence. If you specify
|
||||
<pre>
|
||||
--enable-bsr-anycrlf
|
||||
</pre>
|
||||
the default is changed so that \R matches only CR, LF, or CRLF. Whatever is
|
||||
selected when PCRE is built can be overridden when the library functions are
|
||||
called.
|
||||
</P>
|
||||
<br><a name="SEC7" href="#TOC1">BUILDING SHARED AND STATIC LIBRARIES</a><br>
|
||||
<P>
|
||||
The PCRE building process uses <b>libtool</b> to build both shared and static
|
||||
Unix libraries by default. You can suppress one of these by adding one of
|
||||
@@ -114,7 +157,7 @@ Unix libraries by default. You can suppress one of these by adding one of
|
||||
</pre>
|
||||
to the <b>configure</b> command, as required.
|
||||
</P>
|
||||
<br><a name="SEC7" href="#TOC1">POSIX MALLOC USAGE</a><br>
|
||||
<br><a name="SEC8" href="#TOC1">POSIX MALLOC USAGE</a><br>
|
||||
<P>
|
||||
When PCRE is called through the POSIX interface (see the
|
||||
<a href="pcreposix.html"><b>pcreposix</b></a>
|
||||
@@ -130,7 +173,7 @@ such as
|
||||
</pre>
|
||||
to the <b>configure</b> command.
|
||||
</P>
|
||||
<br><a name="SEC8" href="#TOC1">HANDLING VERY LARGE PATTERNS</a><br>
|
||||
<br><a name="SEC9" href="#TOC1">HANDLING VERY LARGE PATTERNS</a><br>
|
||||
<P>
|
||||
Within a compiled pattern, offset values are used to point from one part to
|
||||
another (for example, from an opening parenthesis to an alternation
|
||||
@@ -146,12 +189,7 @@ to the <b>configure</b> command. The value given must be 2, 3, or 4. Using
|
||||
longer offsets slows down the operation of PCRE because it has to load
|
||||
additional bytes when handling them.
|
||||
</P>
|
||||
<P>
|
||||
If you build PCRE with an increased link size, test 2 (and test 5 if you are
|
||||
using UTF-8) will fail. Part of the output of these tests is a representation
|
||||
of the compiled pattern, and this changes with the link size.
|
||||
</P>
|
||||
<br><a name="SEC9" href="#TOC1">AVOIDING EXCESSIVE STACK USAGE</a><br>
|
||||
<br><a name="SEC10" href="#TOC1">AVOIDING EXCESSIVE STACK USAGE</a><br>
|
||||
<P>
|
||||
When matching with the <b>pcre_exec()</b> function, PCRE implements backtracking
|
||||
by making recursive calls to an internal function called <b>match()</b>. In
|
||||
@@ -169,15 +207,20 @@ build a version of PCRE that works this way, add
|
||||
</pre>
|
||||
to the <b>configure</b> command. With this configuration, PCRE will use the
|
||||
<b>pcre_stack_malloc</b> and <b>pcre_stack_free</b> variables to call memory
|
||||
management functions. Separate functions are provided because the usage is very
|
||||
predictable: the block sizes requested are always the same, and the blocks are
|
||||
always freed in reverse order. A calling program might be able to implement
|
||||
optimized functions that perform better than the standard <b>malloc()</b> and
|
||||
<b>free()</b> functions. PCRE runs noticeably more slowly when built in this
|
||||
way. This option affects only the <b>pcre_exec()</b> function; it is not
|
||||
relevant for the <b>pcre_dfa_exec()</b> function.
|
||||
management functions. By default these point to <b>malloc()</b> and
|
||||
<b>free()</b>, but you can replace the pointers so that your own functions are
|
||||
used.
|
||||
</P>
|
||||
<br><a name="SEC10" href="#TOC1">LIMITING PCRE RESOURCE USAGE</a><br>
|
||||
<P>
|
||||
Separate functions are provided rather than using <b>pcre_malloc</b> and
|
||||
<b>pcre_free</b> because the usage is very predictable: the block sizes
|
||||
requested are always the same, and the blocks are always freed in reverse
|
||||
order. A calling program might be able to implement optimized functions that
|
||||
perform better than <b>malloc()</b> and <b>free()</b>. PCRE runs noticeably more
|
||||
slowly when built in this way. This option affects only the <b>pcre_exec()</b>
|
||||
function; it is not relevant for the <b>pcre_dfa_exec()</b> function.
|
||||
</P>
|
||||
<br><a name="SEC11" href="#TOC1">LIMITING PCRE RESOURCE USAGE</a><br>
|
||||
<P>
|
||||
Internally, PCRE has a function called <b>match()</b>, which it calls repeatedly
|
||||
(sometimes recursively) when matching a pattern with the <b>pcre_exec()</b>
|
||||
@@ -206,20 +249,100 @@ constraints. However, you can set a lower limit by adding, for example,
|
||||
</pre>
|
||||
to the <b>configure</b> command. This value can also be overridden at run time.
|
||||
</P>
|
||||
<br><a name="SEC11" href="#TOC1">USING EBCDIC CODE</a><br>
|
||||
<br><a name="SEC12" href="#TOC1">CREATING CHARACTER TABLES AT BUILD TIME</a><br>
|
||||
<P>
|
||||
PCRE uses fixed tables for processing characters whose code values are less
|
||||
than 256. By default, PCRE is built with a set of tables that are distributed
|
||||
in the file <i>pcre_chartables.c.dist</i>. These tables are for ASCII codes
|
||||
only. If you add
|
||||
<pre>
|
||||
--enable-rebuild-chartables
|
||||
</pre>
|
||||
to the <b>configure</b> command, the distributed tables are no longer used.
|
||||
Instead, a program called <b>dftables</b> is compiled and run. This outputs the
|
||||
source for a new set of tables, created in the default locale of your C runtime
|
||||
system. (This method of replacing the tables does not work if you are cross
|
||||
compiling, because <b>dftables</b> is run on the local host. If you need to
|
||||
create alternative tables when cross compiling, you will have to do so "by
|
||||
hand".)
|
||||
</P>
|
||||
<br><a name="SEC13" href="#TOC1">USING EBCDIC CODE</a><br>
|
||||
<P>
|
||||
PCRE assumes by default that it will run in an environment where the character
|
||||
code is ASCII (or Unicode, which is a superset of ASCII). PCRE can, however, be
|
||||
compiled to run in an EBCDIC environment by adding
|
||||
code is ASCII (or Unicode, which is a superset of ASCII). This is the case for
|
||||
most computer operating systems. PCRE can, however, be compiled to run in an
|
||||
EBCDIC environment by adding
|
||||
<pre>
|
||||
--enable-ebcdic
|
||||
</pre>
|
||||
to the <b>configure</b> command.
|
||||
to the <b>configure</b> command. This setting implies
|
||||
--enable-rebuild-chartables. You should only use it if you know that you are in
|
||||
an EBCDIC environment (for example, an IBM mainframe operating system). The
|
||||
--enable-ebcdic option is incompatible with --enable-utf8.
|
||||
</P>
|
||||
<br><a name="SEC14" href="#TOC1">PCREGREP OPTIONS FOR COMPRESSED FILE SUPPORT</a><br>
|
||||
<P>
|
||||
By default, <b>pcregrep</b> reads all files as plain text. You can build it so
|
||||
that it recognizes files whose names end in <b>.gz</b> or <b>.bz2</b>, and reads
|
||||
them with <b>libz</b> or <b>libbz2</b>, respectively, by adding one or both of
|
||||
<pre>
|
||||
--enable-pcregrep-libz
|
||||
--enable-pcregrep-libbz2
|
||||
</pre>
|
||||
to the <b>configure</b> command. These options naturally require that the
|
||||
relevant libraries are installed on your system. Configuration will fail if
|
||||
they are not.
|
||||
</P>
|
||||
<br><a name="SEC15" href="#TOC1">PCRETEST OPTION FOR LIBREADLINE SUPPORT</a><br>
|
||||
<P>
|
||||
If you add
|
||||
<pre>
|
||||
--enable-pcretest-libreadline
|
||||
</pre>
|
||||
to the <b>configure</b> command, <b>pcretest</b> is linked with the
|
||||
<b>libreadline</b> library, and when its input is from a terminal, it reads it
|
||||
using the <b>readline()</b> function. This provides line-editing and history
|
||||
facilities. Note that <b>libreadline</b> is GPL-licenced, so if you distribute a
|
||||
binary of <b>pcretest</b> linked in this way, there may be licensing issues.
|
||||
</P>
|
||||
<P>
|
||||
Last updated: 06 June 2006
|
||||
Setting this option causes the <b>-lreadline</b> option to be added to the
|
||||
<b>pcretest</b> build. In many operating environments with a system-installed
|
||||
<b>libreadline</b> this is sufficient. However, in some environments (e.g.
|
||||
if an unmodified distribution version of readline is in use), some extra
|
||||
configuration may be necessary. The INSTALL file for <b>libreadline</b> says
|
||||
this:
|
||||
<pre>
|
||||
"Readline uses the termcap functions, but does not link with the
|
||||
termcap or curses library itself, allowing applications which link
|
||||
with readline the to choose an appropriate library."
|
||||
</pre>
|
||||
If your environment has not been set up so that an appropriate library is
|
||||
automatically included, you may need to add something like
|
||||
<pre>
|
||||
LIBS="-lncurses"
|
||||
</pre>
|
||||
immediately before the <b>configure</b> command.
|
||||
</P>
|
||||
<br><a name="SEC16" href="#TOC1">SEE ALSO</a><br>
|
||||
<P>
|
||||
<b>pcreapi</b>(3), <b>pcre_config</b>(3).
|
||||
</P>
|
||||
<br><a name="SEC17" href="#TOC1">AUTHOR</a><br>
|
||||
<P>
|
||||
Philip Hazel
|
||||
<br>
|
||||
University Computing Service
|
||||
<br>
|
||||
Cambridge CB2 3QH, England.
|
||||
<br>
|
||||
</P>
|
||||
<br><a name="SEC18" href="#TOC1">REVISION</a><br>
|
||||
<P>
|
||||
Last updated: 17 March 2009
|
||||
<br>
|
||||
Copyright © 1997-2009 University of Cambridge.
|
||||
<br>
|
||||
Copyright © 1997-2006 University of Cambridge.
|
||||
<p>
|
||||
Return to the <a href="index.html">PCRE index page</a>.
|
||||
</p>
|
||||
|
||||
@@ -17,6 +17,8 @@ man page, in case the conversion went wrong.
|
||||
<li><a name="TOC2" href="#SEC2">MISSING CALLOUTS</a>
|
||||
<li><a name="TOC3" href="#SEC3">THE CALLOUT INTERFACE</a>
|
||||
<li><a name="TOC4" href="#SEC4">RETURN VALUES</a>
|
||||
<li><a name="TOC5" href="#SEC5">AUTHOR</a>
|
||||
<li><a name="TOC6" href="#SEC6">REVISION</a>
|
||||
</ul>
|
||||
<br><a name="SEC1" href="#TOC1">PCRE CALLOUTS</a><br>
|
||||
<P>
|
||||
@@ -35,7 +37,7 @@ function is to be called. Different callout points can be identified by putting
|
||||
a number less than 256 after the letter C. The default value is zero.
|
||||
For example, this pattern has two callout points:
|
||||
<pre>
|
||||
(?C1)\deabc(?C2)def
|
||||
(?C1)abc(?C2)def
|
||||
</pre>
|
||||
If the PCRE_AUTO_CALLOUT option bit is set when <b>pcre_compile()</b> is called,
|
||||
PCRE automatically inserts callouts, all with number 255, before each item in
|
||||
@@ -60,7 +62,8 @@ trying to optimize the performance of a particular pattern.
|
||||
<br><a name="SEC2" href="#TOC1">MISSING CALLOUTS</a><br>
|
||||
<P>
|
||||
You should be aware that, because of optimizations in the way PCRE matches
|
||||
patterns, callouts sometimes do not happen. For example, if the pattern is
|
||||
patterns by default, callouts sometimes do not happen. For example, if the
|
||||
pattern is
|
||||
<pre>
|
||||
ab(?C4)cd
|
||||
</pre>
|
||||
@@ -69,6 +72,12 @@ string is "abyz", the lack of "d" means that matching doesn't ever start, and
|
||||
the callout is never reached. However, with "abyd", though the result is still
|
||||
no match, the callout is obeyed.
|
||||
</P>
|
||||
<P>
|
||||
You can disable these optimizations by passing the PCRE_NO_START_OPTIMIZE
|
||||
option to <b>pcre_exec()</b> or <b>pcre_dfa_exec()</b>. This slows down the
|
||||
matching process, but does ensure that callouts such as the example above are
|
||||
obeyed.
|
||||
</P>
|
||||
<br><a name="SEC3" href="#TOC1">THE CALLOUT INTERFACE</a><br>
|
||||
<P>
|
||||
During matching, when PCRE reaches a callout point, the external function
|
||||
@@ -113,10 +122,12 @@ The <i>subject</i> and <i>subject_length</i> fields contain copies of the values
|
||||
that were passed to <b>pcre_exec()</b>.
|
||||
</P>
|
||||
<P>
|
||||
The <i>start_match</i> field contains the offset within the subject at which the
|
||||
current match attempt started. If the pattern is not anchored, the callout
|
||||
function may be called several times from the same point in the pattern for
|
||||
different starting points in the subject.
|
||||
The <i>start_match</i> field normally contains the offset within the subject at
|
||||
which the current match attempt started. However, if the escape sequence \K
|
||||
has been encountered, this value is changed to reflect the modified starting
|
||||
point. If the pattern is not anchored, the callout function may be called
|
||||
several times from the same point in the pattern for different starting points
|
||||
in the subject.
|
||||
</P>
|
||||
<P>
|
||||
The <i>current_position</i> field contains the offset within the subject of the
|
||||
@@ -177,10 +188,21 @@ values. In particular, PCRE_ERROR_NOMATCH forces a standard "no match" failure.
|
||||
The error number PCRE_ERROR_CALLOUT is reserved for use by callout functions;
|
||||
it will never be used by PCRE itself.
|
||||
</P>
|
||||
<br><a name="SEC5" href="#TOC1">AUTHOR</a><br>
|
||||
<P>
|
||||
Last updated: 28 February 2005
|
||||
Philip Hazel
|
||||
<br>
|
||||
University Computing Service
|
||||
<br>
|
||||
Cambridge CB2 3QH, England.
|
||||
<br>
|
||||
</P>
|
||||
<br><a name="SEC6" href="#TOC1">REVISION</a><br>
|
||||
<P>
|
||||
Last updated: 15 March 2009
|
||||
<br>
|
||||
Copyright © 1997-2009 University of Cambridge.
|
||||
<br>
|
||||
Copyright © 1997-2005 University of Cambridge.
|
||||
<p>
|
||||
Return to the <a href="index.html">PCRE index page</a>.
|
||||
</p>
|
||||
|
||||
@@ -17,8 +17,9 @@ DIFFERENCES BETWEEN PCRE AND PERL
|
||||
</b><br>
|
||||
<P>
|
||||
This document describes the differences in the ways that PCRE and Perl handle
|
||||
regular expressions. The differences described here are with respect to Perl
|
||||
5.8.
|
||||
regular expressions. The differences described here are mainly with respect to
|
||||
Perl 5.8, though PCRE versions 7.0 and later contain some features that are
|
||||
expected to be in the forthcoming Perl 5.10.
|
||||
</P>
|
||||
<P>
|
||||
1. PCRE has only a subset of Perl's UTF-8 and Unicode support. Details of what
|
||||
@@ -76,20 +77,34 @@ following examples:
|
||||
The \Q...\E sequence is recognized both inside and outside character classes.
|
||||
</P>
|
||||
<P>
|
||||
8. Fairly obviously, PCRE does not support the (?{code}) and (?p{code})
|
||||
constructions. However, there is support for recursive patterns using the
|
||||
non-Perl items (?R), (?number), and (?P>name). Also, the PCRE "callout" feature
|
||||
allows an external function to be called during pattern matching. See the
|
||||
8. Fairly obviously, PCRE does not support the (?{code}) and (??{code})
|
||||
constructions. However, there is support for recursive patterns. This is not
|
||||
available in Perl 5.8, but will be in Perl 5.10. Also, the PCRE "callout"
|
||||
feature allows an external function to be called during pattern matching. See
|
||||
the
|
||||
<a href="pcrecallout.html"><b>pcrecallout</b></a>
|
||||
documentation for details.
|
||||
</P>
|
||||
<P>
|
||||
9. There are some differences that are concerned with the settings of captured
|
||||
9. Subpatterns that are called recursively or as "subroutines" are always
|
||||
treated as atomic groups in PCRE. This is like Python, but unlike Perl.
|
||||
</P>
|
||||
<P>
|
||||
10. There are some differences that are concerned with the settings of captured
|
||||
strings when part of a pattern is repeated. For example, matching "aba" against
|
||||
the pattern /^(a(b)?)+$/ in Perl leaves $2 unset, but in PCRE it is set to "b".
|
||||
</P>
|
||||
<P>
|
||||
10. PCRE provides some extensions to the Perl regular expression facilities:
|
||||
11. PCRE does support Perl 5.10's backtracking verbs (*ACCEPT), (*FAIL), (*F),
|
||||
(*COMMIT), (*PRUNE), (*SKIP), and (*THEN), but only in the forms without an
|
||||
argument. PCRE does not support (*MARK). If (*ACCEPT) is within capturing
|
||||
parentheses, PCRE does not set that capture group; this is different to Perl.
|
||||
</P>
|
||||
<P>
|
||||
12. PCRE provides some extensions to the Perl regular expression facilities.
|
||||
Perl 5.10 will include new features that are not in earlier versions, some of
|
||||
which (such as named parentheses) have been in PCRE for some time. This list is
|
||||
with respect to Perl 5.10:
|
||||
<br>
|
||||
<br>
|
||||
(a) Although lookbehind assertions must match fixed length strings, each
|
||||
@@ -102,8 +117,8 @@ meta-character matches only at the very end of the string.
|
||||
<br>
|
||||
<br>
|
||||
(c) If PCRE_EXTRA is set, a backslash followed by a letter with no special
|
||||
meaning is faulted. Otherwise, like Perl, the backslash is ignored. (Perl can
|
||||
be made to issue a warning.)
|
||||
meaning is faulted. Otherwise, like Perl, the backslash is quietly ignored.
|
||||
(Perl can be made to issue a warning.)
|
||||
<br>
|
||||
<br>
|
||||
(d) If PCRE_UNGREEDY is set, the greediness of the repetition quantifiers is
|
||||
@@ -119,38 +134,46 @@ only at the first matching position in the subject string.
|
||||
options for <b>pcre_exec()</b> have no Perl equivalents.
|
||||
<br>
|
||||
<br>
|
||||
(g) The (?R), (?number), and (?P>name) constructs allows for recursive pattern
|
||||
matching (Perl can do this using the (?p{code}) construct, which PCRE cannot
|
||||
support.)
|
||||
(g) The \R escape sequence can be restricted to match only CR, LF, or CRLF
|
||||
by the PCRE_BSR_ANYCRLF option.
|
||||
<br>
|
||||
<br>
|
||||
(h) PCRE supports named capturing substrings, using the Python syntax.
|
||||
(h) The callout facility is PCRE-specific.
|
||||
<br>
|
||||
<br>
|
||||
(i) PCRE supports the possessive quantifier "++" syntax, taken from Sun's Java
|
||||
package.
|
||||
(i) The partial matching facility is PCRE-specific.
|
||||
<br>
|
||||
<br>
|
||||
(j) The (R) condition, for testing recursion, is a PCRE extension.
|
||||
<br>
|
||||
<br>
|
||||
(k) The callout facility is PCRE-specific.
|
||||
<br>
|
||||
<br>
|
||||
(l) The partial matching facility is PCRE-specific.
|
||||
<br>
|
||||
<br>
|
||||
(m) Patterns compiled by PCRE can be saved and re-used at a later time, even on
|
||||
(j) Patterns compiled by PCRE can be saved and re-used at a later time, even on
|
||||
different hosts that have the other endianness.
|
||||
<br>
|
||||
<br>
|
||||
(n) The alternative matching function (<b>pcre_dfa_exec()</b>) matches in a
|
||||
(k) The alternative matching function (<b>pcre_dfa_exec()</b>) matches in a
|
||||
different way and is not Perl-compatible.
|
||||
</P>
|
||||
<P>
|
||||
Last updated: 06 June 2006
|
||||
<br>
|
||||
Copyright © 1997-2006 University of Cambridge.
|
||||
<br>
|
||||
(l) PCRE recognizes some special sequences such as (*CR) at the start of
|
||||
a pattern that set overall options that cannot be changed within the pattern.
|
||||
</P>
|
||||
<br><b>
|
||||
AUTHOR
|
||||
</b><br>
|
||||
<P>
|
||||
Philip Hazel
|
||||
<br>
|
||||
University Computing Service
|
||||
<br>
|
||||
Cambridge CB2 3QH, England.
|
||||
<br>
|
||||
</P>
|
||||
<br><b>
|
||||
REVISION
|
||||
</b><br>
|
||||
<P>
|
||||
Last updated: 11 September 2007
|
||||
<br>
|
||||
Copyright © 1997-2007 University of Cambridge.
|
||||
<br>
|
||||
<p>
|
||||
Return to the <a href="index.html">PCRE index page</a>.
|
||||
</p>
|
||||
|
||||
@@ -16,20 +16,20 @@ man page, in case the conversion went wrong.
|
||||
<li><a name="TOC1" href="#SEC1">SYNOPSIS OF C++ WRAPPER</a>
|
||||
<li><a name="TOC2" href="#SEC2">DESCRIPTION</a>
|
||||
<li><a name="TOC3" href="#SEC3">MATCHING INTERFACE</a>
|
||||
<li><a name="TOC4" href="#SEC4">PARTIAL MATCHES</a>
|
||||
<li><a name="TOC5" href="#SEC5">UTF-8 AND THE MATCHING INTERFACE</a>
|
||||
<li><a name="TOC6" href="#SEC6">PASSING MODIFIERS TO THE REGULAR EXPRESSION ENGINE</a>
|
||||
<li><a name="TOC7" href="#SEC7">SCANNING TEXT INCREMENTALLY</a>
|
||||
<li><a name="TOC8" href="#SEC8">PARSING HEX/OCTAL/C-RADIX NUMBERS</a>
|
||||
<li><a name="TOC9" href="#SEC9">REPLACING PARTS OF STRINGS</a>
|
||||
<li><a name="TOC10" href="#SEC10">AUTHOR</a>
|
||||
<li><a name="TOC4" href="#SEC4">QUOTING METACHARACTERS</a>
|
||||
<li><a name="TOC5" href="#SEC5">PARTIAL MATCHES</a>
|
||||
<li><a name="TOC6" href="#SEC6">UTF-8 AND THE MATCHING INTERFACE</a>
|
||||
<li><a name="TOC7" href="#SEC7">PASSING MODIFIERS TO THE REGULAR EXPRESSION ENGINE</a>
|
||||
<li><a name="TOC8" href="#SEC8">SCANNING TEXT INCREMENTALLY</a>
|
||||
<li><a name="TOC9" href="#SEC9">PARSING HEX/OCTAL/C-RADIX NUMBERS</a>
|
||||
<li><a name="TOC10" href="#SEC10">REPLACING PARTS OF STRINGS</a>
|
||||
<li><a name="TOC11" href="#SEC11">AUTHOR</a>
|
||||
<li><a name="TOC12" href="#SEC12">REVISION</a>
|
||||
</ul>
|
||||
<br><a name="SEC1" href="#TOC1">SYNOPSIS OF C++ WRAPPER</a><br>
|
||||
<P>
|
||||
<b>#include &lt;pcrecpp.h&gt;</b>
|
||||
</P>
|
||||
<P>
|
||||
</P>
|
||||
<br><a name="SEC2" href="#TOC1">DESCRIPTION</a><br>
|
||||
<P>
|
||||
The C++ wrapper for PCRE was provided by Google Inc. Some additional
|
||||
@@ -101,16 +101,43 @@ The function returns true iff all of the following conditions are satisfied:
|
||||
|
||||
c. The "i"th argument has a suitable type for holding the
|
||||
string captured as the "i"th sub-pattern. If you pass in
|
||||
NULL for the "i"th argument, or pass fewer arguments than
|
||||
void * NULL for the "i"th argument, or a non-void * NULL
|
||||
of the correct type, or pass fewer arguments than the
|
||||
number of sub-patterns, the "i"th captured sub-pattern is
|
||||
ignored.
|
||||
</pre>
|
||||
CAVEAT: An optional sub-pattern that does not exist in the matched
|
||||
string is assigned the empty string. Therefore, the following will
|
||||
return false (because the empty string is not a valid number):
|
||||
<pre>
|
||||
int number;
|
||||
pcrecpp::RE::FullMatch("abc", "[a-z]+(\\d+)?", &number);
|
||||
</pre>
|
||||
The matching interface supports at most 16 arguments per call.
|
||||
If you need more, consider using the more general interface
|
||||
<b>pcrecpp::RE::DoMatch</b>. See <b>pcrecpp.h</b> for the signature for
|
||||
<b>DoMatch</b>.
|
||||
</P>
|
||||
<br><a name="SEC4" href="#TOC1">PARTIAL MATCHES</a><br>
|
||||
<P>
|
||||
NOTE: Do not use <b>no_arg</b>, which is used internally to mark the end of a
|
||||
list of optional arguments, as a placeholder for missing arguments, as this can
|
||||
lead to segfaults.
|
||||
</P>
|
||||
<br><a name="SEC4" href="#TOC1">QUOTING METACHARACTERS</a><br>
|
||||
<P>
|
||||
You can use the "QuoteMeta" operation to insert backslashes before all
|
||||
potentially meaningful characters in a string. The returned string, used as a
|
||||
regular expression, will exactly match the original string.
|
||||
<pre>
|
||||
Example:
|
||||
string quoted = RE::QuoteMeta(unquoted);
|
||||
</pre>
|
||||
Note that it's legal to escape a character even if it has no special meaning in
|
||||
a regular expression -- so this function does that. (This also makes it
|
||||
identical to the Perl function of the same name; see "perldoc -f quotemeta".)
|
||||
For example, "1.5-2.0?" becomes "1\.5\-2\.0\?".
|
||||
</P>
|
||||
<br><a name="SEC5" href="#TOC1">PARTIAL MATCHES</a><br>
|
||||
<P>
|
||||
You can use the "PartialMatch" operation when you want the pattern
|
||||
to match any substring of the text.
|
||||
@@ -125,7 +152,7 @@ to match any substring of the text.
|
||||
assert(number == 100);
|
||||
</PRE>
|
||||
</P>
|
||||
<br><a name="SEC5" href="#TOC1">UTF-8 AND THE MATCHING INTERFACE</a><br>
|
||||
<br><a name="SEC6" href="#TOC1">UTF-8 AND THE MATCHING INTERFACE</a><br>
|
||||
<P>
|
||||
By default, pattern and text are plain text, one byte per character. The UTF8
|
||||
flag, passed to the constructor, causes both pattern and string to be treated
|
||||
@@ -150,7 +177,7 @@ NOTE: The UTF8 flag is ignored if pcre was not configured with the
|
||||
--enable-utf8 flag.
|
||||
</PRE>
|
||||
</P>
|
||||
<br><a name="SEC6" href="#TOC1">PASSING MODIFIERS TO THE REGULAR EXPRESSION ENGINE</a><br>
|
||||
<br><a name="SEC7" href="#TOC1">PASSING MODIFIERS TO THE REGULAR EXPRESSION ENGINE</a><br>
|
||||
<P>
|
||||
PCRE defines some modifiers to change the behavior of the regular expression
|
||||
engine. The C++ wrapper defines an auxiliary class, RE_Options, as a vehicle to
|
||||
@@ -244,7 +271,7 @@ PCRE_EXTENDED, and PCRE_MULTILINE to a RE with one statement, you may write:
|
||||
|
||||
</PRE>
|
||||
</P>
|
||||
<br><a name="SEC7" href="#TOC1">SCANNING TEXT INCREMENTALLY</a><br>
|
||||
<br><a name="SEC8" href="#TOC1">SCANNING TEXT INCREMENTALLY</a><br>
|
||||
<P>
|
||||
The "Consume" operation may be useful if you want to repeatedly
|
||||
match regular expressions at the front of a string and skip over
|
||||
@@ -277,7 +304,7 @@ could extract all words from a string by repeatedly calling
|
||||
pcrecpp::RE("(\\w+)").FindAndConsume(&input, &word)
|
||||
</PRE>
|
||||
</P>
|
||||
<br><a name="SEC8" href="#TOC1">PARSING HEX/OCTAL/C-RADIX NUMBERS</a><br>
|
||||
<br><a name="SEC9" href="#TOC1">PARSING HEX/OCTAL/C-RADIX NUMBERS</a><br>
|
||||
<P>
|
||||
By default, if you pass a pointer to a numeric value, the
|
||||
corresponding text is interpreted as a base-10 number. You can
|
||||
@@ -295,7 +322,7 @@ prefixes, but defaults to base-10.
|
||||
</pre>
|
||||
will leave 64 in a, b, c, and d.
|
||||
</P>
|
||||
<br><a name="SEC9" href="#TOC1">REPLACING PARTS OF STRINGS</a><br>
|
||||
<br><a name="SEC10" href="#TOC1">REPLACING PARTS OF STRINGS</a><br>
|
||||
<P>
|
||||
You can replace the first match of "pattern" in "str" with "rewrite".
|
||||
Within "rewrite", backslash-escaped digits (\1 to \9) can be
|
||||
@@ -327,11 +354,17 @@ The non-matching portions of "text" are ignored. Returns true iff a match
|
||||
occurred and the extraction happened successfully; if no match occurs, the
|
||||
string is left unaffected.
|
||||
</P>
|
||||
<br><a name="SEC10" href="#TOC1">AUTHOR</a><br>
|
||||
<br><a name="SEC11" href="#TOC1">AUTHOR</a><br>
|
||||
<P>
|
||||
The C++ wrapper was contributed by Google Inc.
|
||||
<br>
|
||||
Copyright © 2005 Google Inc.
|
||||
Copyright © 2007 Google Inc.
|
||||
<br>
|
||||
</P>
|
||||
<br><a name="SEC12" href="#TOC1">REVISION</a><br>
|
||||
<P>
|
||||
Last updated: 17 March 2009
|
||||
<br>
|
||||
<p>
|
||||
Return to the <a href="index.html">PCRE index page</a>.
|
||||
</p>
|
||||
|
||||
@@ -15,14 +15,17 @@ man page, in case the conversion went wrong.
|
||||
<ul>
|
||||
<li><a name="TOC1" href="#SEC1">SYNOPSIS</a>
|
||||
<li><a name="TOC2" href="#SEC2">DESCRIPTION</a>
|
||||
<li><a name="TOC3" href="#SEC3">OPTIONS</a>
|
||||
<li><a name="TOC4" href="#SEC4">ENVIRONMENT VARIABLES</a>
|
||||
<li><a name="TOC5" href="#SEC5">NEWLINES</a>
|
||||
<li><a name="TOC6" href="#SEC6">OPTIONS COMPATIBILITY</a>
|
||||
<li><a name="TOC7" href="#SEC7">OPTIONS WITH DATA</a>
|
||||
<li><a name="TOC8" href="#SEC8">MATCHING ERRORS</a>
|
||||
<li><a name="TOC9" href="#SEC9">DIAGNOSTICS</a>
|
||||
<li><a name="TOC10" href="#SEC10">AUTHOR</a>
|
||||
<li><a name="TOC3" href="#SEC3">SUPPORT FOR COMPRESSED FILES</a>
|
||||
<li><a name="TOC4" href="#SEC4">OPTIONS</a>
|
||||
<li><a name="TOC5" href="#SEC5">ENVIRONMENT VARIABLES</a>
|
||||
<li><a name="TOC6" href="#SEC6">NEWLINES</a>
|
||||
<li><a name="TOC7" href="#SEC7">OPTIONS COMPATIBILITY</a>
|
||||
<li><a name="TOC8" href="#SEC8">OPTIONS WITH DATA</a>
|
||||
<li><a name="TOC9" href="#SEC9">MATCHING ERRORS</a>
|
||||
<li><a name="TOC10" href="#SEC10">DIAGNOSTICS</a>
|
||||
<li><a name="TOC11" href="#SEC11">SEE ALSO</a>
|
||||
<li><a name="TOC12" href="#SEC12">AUTHOR</a>
|
||||
<li><a name="TOC13" href="#SEC13">REVISION</a>
|
||||
</ul>
|
||||
<br><a name="SEC1" href="#TOC1">SYNOPSIS</a><br>
|
||||
<P>
|
||||
@@ -33,9 +36,9 @@ man page, in case the conversion went wrong.
|
||||
<b>pcregrep</b> searches files for character patterns, in the same way as other
|
||||
grep commands do, but it uses the PCRE regular expression library to support
|
||||
patterns that are compatible with the regular expressions of Perl 5. See
|
||||
<a href="pcrepattern.html"><b>pcrepattern</b></a>
|
||||
for a full description of syntax and semantics of the regular expressions that
|
||||
PCRE supports.
|
||||
<a href="pcrepattern.html"><b>pcrepattern</b>(3)</a>
|
||||
for a full description of syntax and semantics of the regular expressions
|
||||
that PCRE supports.
|
||||
</P>
|
||||
<P>
|
||||
Patterns, whether supplied on the command line or in a separate file, are given
|
||||
@@ -45,9 +48,9 @@ without delimiters. For example:
|
||||
</pre>
|
||||
If you attempt to use delimiters (for example, by surrounding a pattern with
|
||||
slashes, as is common in Perl scripts), they are interpreted as part of the
|
||||
pattern. Quotes can of course be used on the command line because they are
|
||||
interpreted by the shell, and indeed they are required if a pattern contains
|
||||
white space or shell metacharacters.
|
||||
pattern. Quotes can of course be used to delimit patterns on the command line
|
||||
because they are interpreted by the shell, and indeed they are required if a
|
||||
pattern contains white space or shell metacharacters.
|
||||
</P>
|
||||
<P>
|
||||
The first argument that follows any option settings is treated as the single
|
||||
@@ -63,23 +66,58 @@ For example:
|
||||
<pre>
|
||||
pcregrep some-pattern /file1 - /file3
|
||||
</pre>
|
||||
By default, each line that matches the pattern is copied to the standard
|
||||
By default, each line that matches a pattern is copied to the standard
|
||||
output, and if there is more than one file, the file name is output at the
|
||||
start of each line. However, there are options that can change how
|
||||
<b>pcregrep</b> behaves. In particular, the <b>-M</b> option makes it possible to
|
||||
search for patterns that span line boundaries. What defines a line boundary is
|
||||
controlled by the <b>-N</b> (<b>--newline</b>) option.
|
||||
start of each line, followed by a colon. However, there are options that can
|
||||
change how <b>pcregrep</b> behaves. In particular, the <b>-M</b> option makes it
|
||||
possible to search for patterns that span line boundaries. What defines a line
|
||||
boundary is controlled by the <b>-N</b> (<b>--newline</b>) option.
|
||||
</P>
|
||||
<P>
|
||||
Patterns are limited to 8K or BUFSIZ characters, whichever is the greater.
|
||||
BUFSIZ is defined in <b><stdio.h></b>.
|
||||
BUFSIZ is defined in <b>&lt;stdio.h&gt;</b>. When there is more than one pattern
|
||||
(specified by the use of <b>-e</b> and/or <b>-f</b>), each pattern is applied to
|
||||
each line in the order in which they are defined, except that all the <b>-e</b>
|
||||
patterns are tried before the <b>-f</b> patterns.
|
||||
</P>
|
||||
<P>
|
||||
By default, as soon as one pattern matches (or fails to match when <b>-v</b> is
|
||||
used), no further patterns are considered. However, if <b>--colour</b> (or
|
||||
<b>--color</b>) is used to colour the matching substrings, or if
|
||||
<b>--only-matching</b>, <b>--file-offsets</b>, or <b>--line-offsets</b> is used to
|
||||
output only the part of the line that matched (either shown literally, or as an
|
||||
offset), scanning resumes immediately following the match, so that further
|
||||
matches on the same line can be found. If there are multiple patterns, they are
|
||||
all tried on the remainder of the line, but patterns that follow the one that
|
||||
matched are not tried on the earlier part of the line.
|
||||
</P>
|
||||
<P>
|
||||
This is the same behaviour as GNU grep, but it does mean that the order in
|
||||
which multiple patterns are specified can affect the output when one of the
|
||||
above options is used.
|
||||
</P>
|
||||
<P>
|
||||
Patterns that can match an empty string are accepted, but empty string
|
||||
matches are not recognized. An example is the pattern "(super)?(man)?", in
|
||||
which all components are optional. This pattern finds all occurrences of both
|
||||
"super" and "man"; the output differs from matching with "super|man" when only
|
||||
the matching substrings are being shown.
|
||||
</P>
|
||||
<P>
|
||||
If the <b>LC_ALL</b> or <b>LC_CTYPE</b> environment variable is set,
|
||||
<b>pcregrep</b> uses the value to set a locale when calling the PCRE library.
|
||||
The <b>--locale</b> option can be used to override this.
|
||||
</P>
|
||||
<br><a name="SEC3" href="#TOC1">OPTIONS</a><br>
|
||||
<br><a name="SEC3" href="#TOC1">SUPPORT FOR COMPRESSED FILES</a><br>
|
||||
<P>
|
||||
It is possible to compile <b>pcregrep</b> so that it uses <b>libz</b> or
|
||||
<b>libbz2</b> to read files whose names end in <b>.gz</b> or <b>.bz2</b>,
|
||||
respectively. You can find out whether your binary has support for one or both
|
||||
of these file types by running it with the <b>--help</b> option. If the
|
||||
appropriate support is not present, files are treated as plain text. The
|
||||
standard input is always so treated.
|
||||
</P>
|
||||
<br><a name="SEC4" href="#TOC1">OPTIONS</a><br>
|
||||
<P>
|
||||
<b>--</b>
|
||||
This terminates the list of options. It is useful if the next item on the
|
||||
@@ -124,16 +162,21 @@ equals sign.
|
||||
</P>
|
||||
<P>
|
||||
<b>--colour=</b><i>value</i>, <b>--color=</b><i>value</i>
|
||||
This option specifies under what circumstances the part of a line that matched
|
||||
a pattern should be coloured in the output. The value may be "never" (the
|
||||
default), "always", or "auto". In the latter case, colouring happens only if
|
||||
the standard output is connected to a terminal. The colour can be specified by
|
||||
setting the environment variable PCREGREP_COLOUR or PCREGREP_COLOR. The value
|
||||
of this variable should be a string of two numbers, separated by a semicolon.
|
||||
They are copied directly into the control string for setting colour on a
|
||||
terminal, so it is your responsibility to ensure that they make sense. If
|
||||
neither of the environment variables is set, the default is "1;31", which gives
|
||||
red.
|
||||
This option specifies under what circumstances the parts of a line that matched
|
||||
a pattern should be coloured in the output. By default, the output is not
|
||||
coloured. The value (which is optional, see above) may be "never", "always", or
|
||||
"auto". In the latter case, colouring happens only if the standard output is
|
||||
connected to a terminal. More resources are used when colouring is enabled,
|
||||
because <b>pcregrep</b> has to search for all possible matches in a line, not
|
||||
just one, in order to colour them all.
|
||||
</P>
|
||||
<P>
|
||||
The colour that is used can be specified by setting the environment variable
|
||||
PCREGREP_COLOUR or PCREGREP_COLOR. The value of this variable should be a
|
||||
string of two numbers, separated by a semicolon. They are copied directly into
|
||||
the control string for setting colour on a terminal, so it is your
|
||||
responsibility to ensure that they make sense. If neither of the environment
|
||||
variables is set, the default is "1;31", which gives red.
|
||||
</P>
|
||||
<P>
|
||||
<b>-D</b> <i>action</i>, <b>--devices=</b><i>action</i>
|
||||
@@ -150,30 +193,43 @@ are read as if they were ordinary files. In some operating systems the effect
|
||||
of reading a directory like this is an immediate end-of-file.
|
||||
</P>
|
||||
<P>
|
||||
<b>-e</b> <i>pattern</i>, <b>--regex=</b><i>pattern</i>,
|
||||
<b>--regexp=</b><i>pattern</i> Specify a pattern to be matched. This option can
|
||||
be used multiple times in order to specify several patterns. It can also be
|
||||
used as a way of specifying a single pattern that starts with a hyphen. When
|
||||
<b>-e</b> is used, no argument pattern is taken from the command line; all
|
||||
arguments are treated as file names. There is an overall maximum of 100
|
||||
patterns. They are applied to each line in the order in which they are defined
|
||||
until one matches (or fails to match if <b>-v</b> is used). If <b>-f</b> is used
|
||||
with <b>-e</b>, the command line patterns are matched first, followed by the
|
||||
patterns from the file, independent of the order in which these options are
|
||||
specified. Note that multiple use of <b>-e</b> is not the same as a single
|
||||
pattern with alternatives. For example, X|Y finds the first character in a line
|
||||
that is X or Y, whereas if the two patterns are given separately,
|
||||
<b>pcregrep</b> finds X if it is present, even if it follows Y in the line. It
|
||||
finds Y only if there is no X in the line. This really matters only if you are
|
||||
using <b>-o</b> to show the portion of the line that matched.
|
||||
<b>-e</b> <i>pattern</i>, <b>--regex=</b><i>pattern</i>, <b>--regexp=</b><i>pattern</i>
|
||||
Specify a pattern to be matched. This option can be used multiple times in
|
||||
order to specify several patterns. It can also be used as a way of specifying a
|
||||
single pattern that starts with a hyphen. When <b>-e</b> is used, no argument
|
||||
pattern is taken from the command line; all arguments are treated as file
|
||||
names. There is an overall maximum of 100 patterns. They are applied to each
|
||||
line in the order in which they are defined until one matches (or fails to
|
||||
match if <b>-v</b> is used). If <b>-f</b> is used with <b>-e</b>, the command line
|
||||
patterns are matched first, followed by the patterns from the file, independent
|
||||
of the order in which these options are specified. Note that multiple use of
|
||||
<b>-e</b> is not the same as a single pattern with alternatives. For example,
|
||||
X|Y finds the first character in a line that is X or Y, whereas if the two
|
||||
patterns are given separately, <b>pcregrep</b> finds X if it is present, even if
|
||||
it follows Y in the line. It finds Y only if there is no X in the line. This
|
||||
really matters only if you are using <b>-o</b> to show the part(s) of the line
|
||||
that matched.
|
||||
</P>
|
||||
<P>
|
||||
<b>--exclude</b>=<i>pattern</i>
|
||||
When <b>pcregrep</b> is searching the files in a directory as a consequence of
|
||||
the <b>-r</b> (recursive search) option, any files whose names match the pattern
|
||||
are excluded. The pattern is a PCRE regular expression. If a file name matches
|
||||
both <b>--include</b> and <b>--exclude</b>, it is excluded. There is no short
|
||||
form for this option.
|
||||
the <b>-r</b> (recursive search) option, any regular files whose names match the
|
||||
pattern are excluded. Subdirectories are not excluded by this option; they are
|
||||
searched recursively, subject to the <b>--exclude_dir</b> and
|
||||
<b>--include_dir</b> options. The pattern is a PCRE regular expression, and is
|
||||
matched against the final component of the file name (not the entire path). If
|
||||
a file name matches both <b>--include</b> and <b>--exclude</b>, it is excluded.
|
||||
There is no short form for this option.
|
||||
</P>
|
||||
<P>
|
||||
<b>--exclude_dir</b>=<i>pattern</i>
|
||||
When <b>pcregrep</b> is searching the contents of a directory as a consequence
|
||||
of the <b>-r</b> (recursive search) option, any subdirectories whose names match
|
||||
the pattern are excluded. (Note that the <b>--exclude</b> option does not affect
|
||||
subdirectories.) The pattern is a PCRE regular expression, and is matched
|
||||
against the final component of the name (not the entire path). If a
|
||||
subdirectory name matches both <b>--include_dir</b> and <b>--exclude_dir</b>, it
|
||||
is excluded. There is no short form for this option.
|
||||
</P>
|
||||
<P>
|
||||
<b>-F</b>, <b>--fixed-strings</b>
|
||||
@@ -193,27 +249,37 @@ present; they are tested before the file's patterns. However, no other pattern
|
||||
is taken from the command line; all arguments are treated as file names. There
|
||||
is an overall maximum of 100 patterns. Trailing white space is removed from
|
||||
each line, and blank lines are ignored. An empty file contains no patterns and
|
||||
therefore matches nothing.
|
||||
therefore matches nothing. See also the comments about multiple patterns versus
|
||||
a single pattern with alternatives in the description of <b>-e</b> above.
|
||||
</P>
|
||||
<P>
|
||||
<b>--file-offsets</b>
|
||||
Instead of showing lines or parts of lines that match, show each match as an
|
||||
offset from the start of the file and a length, separated by a comma. In this
|
||||
mode, no context is shown. That is, the <b>-A</b>, <b>-B</b>, and <b>-C</b>
|
||||
options are ignored. If there is more than one match in a line, each of them is
|
||||
shown separately. This option is mutually exclusive with <b>--line-offsets</b>
|
||||
and <b>--only-matching</b>.
|
||||
</P>
|
||||
<P>
|
||||
<b>-H</b>, <b>--with-filename</b>
|
||||
Force the inclusion of the filename at the start of output lines when searching
|
||||
a single file. By default, the filename is not shown in this case. For matching
|
||||
lines, the filename is followed by a colon and a space; for context lines, a
|
||||
hyphen separator is used. If a line number is also being output, it follows the
|
||||
file name without a space.
|
||||
lines, the filename is followed by a colon; for context lines, a hyphen
|
||||
separator is used. If a line number is also being output, it follows the file
|
||||
name.
|
||||
</P>
|
||||
<P>
|
||||
<b>-h</b>, <b>--no-filename</b>
|
||||
Suppress the output of filenames when searching multiple files. By default,
|
||||
filenames are shown when multiple files are searched. For matching lines, the
|
||||
filename is followed by a colon and a space; for context lines, a hyphen
|
||||
separator is used. If a line number is also being output, it follows the file
|
||||
name without a space.
|
||||
filename is followed by a colon; for context lines, a hyphen separator is used.
|
||||
If a line number is also being output, it follows the file name.
|
||||
</P>
|
||||
<P>
|
||||
<b>--help</b>
|
||||
Output a brief help message and exit.
|
||||
Output a help message, giving brief details of the command options and file
|
||||
type support, and then exit.
|
||||
</P>
|
||||
<P>
|
||||
<b>-i</b>, <b>--ignore-case</b>
|
||||
@@ -222,10 +288,23 @@ Ignore upper/lower case distinctions during comparisons.
|
||||
<P>
|
||||
<b>--include</b>=<i>pattern</i>
|
||||
When <b>pcregrep</b> is searching the files in a directory as a consequence of
|
||||
the <b>-r</b> (recursive search) option, only those files whose names match the
|
||||
pattern are included. The pattern is a PCRE regular expression. If a file name
|
||||
matches both <b>--include</b> and <b>--exclude</b>, it is excluded. There is no
|
||||
short form for this option.
|
||||
the <b>-r</b> (recursive search) option, only those regular files whose names
|
||||
match the pattern are included. Subdirectories are always included and searched
|
||||
recursively, subject to the <b>--include_dir</b> and <b>--exclude_dir</b>
|
||||
options. The pattern is a PCRE regular expression, and is matched against the
|
||||
final component of the file name (not the entire path). If a file name matches
|
||||
both <b>--include</b> and <b>--exclude</b>, it is excluded. There is no short
|
||||
form for this option.
|
||||
</P>
|
||||
<P>
|
||||
<b>--include_dir</b>=<i>pattern</i>
|
||||
When <b>pcregrep</b> is searching the contents of a directory as a consequence
|
||||
of the <b>-r</b> (recursive search) option, only those subdirectories whose
|
||||
names match the pattern are included. (Note that the <b>--include</b> option
|
||||
does not affect subdirectories.) The pattern is a PCRE regular expression, and
|
||||
is matched against the final component of the name (not the entire path). If a
|
||||
subdirectory name matches both <b>--include_dir</b> and <b>--exclude_dir</b>, it
|
||||
is excluded. There is no short form for this option.
|
||||
</P>
|
||||
<P>
|
||||
<b>-L</b>, <b>--files-without-match</b>
|
||||
@@ -247,6 +326,16 @@ are being output. If not supplied, "(standard input)" is used. There is no
|
||||
short form for this option.
|
||||
</P>
|
||||
<P>
|
||||
<b>--line-offsets</b>
|
||||
Instead of showing lines or parts of lines that match, show each match as a
|
||||
line number, the offset from the start of the line, and a length. The line
|
||||
number is terminated by a colon (as usual; see the <b>-n</b> option), and the
|
||||
offset and length are separated by a comma. In this mode, no context is shown.
|
||||
That is, the <b>-A</b>, <b>-B</b>, and <b>-C</b> options are ignored. If there is
|
||||
more than one match in a line, each of them is shown separately. This option is
|
||||
mutually exclusive with <b>--file-offsets</b> and <b>--only-matching</b>.
|
||||
</P>
|
||||
<P>
|
||||
<b>--locale</b>=<i>locale-name</i>
|
||||
This option specifies a locale to be used for pattern matching. It overrides
|
||||
the value in the <b>LC_ALL</b> or <b>LC_CTYPE</b> environment variables. If no
|
||||
@@ -268,28 +357,41 @@ are guaranteed to be available for lookbehind assertions.
|
||||
</P>
|
||||
<P>
|
||||
<b>-N</b> <i>newline-type</i>, <b>--newline=</b><i>newline-type</i>
|
||||
The PCRE library supports three different character sequences for indicating
|
||||
The PCRE library supports five different conventions for indicating
|
||||
the ends of lines. They are the single-character sequences CR (carriage return)
|
||||
and LF (linefeed), and the two-character sequence CR, LF. When the library is
|
||||
built, a default line-ending sequence is specified. This is normally the
|
||||
standard sequence for the operating system. Unless otherwise specified by this
|
||||
option, <b>pcregrep</b> uses the default. The possible values for this option
|
||||
are CR, LF, or CRLF. This makes it possible to use <b>pcregrep</b> on files that
|
||||
have come from other environments without having to modify their line endings.
|
||||
If the data that is being scanned does not agree with the convention set by
|
||||
this option, <b>pcregrep</b> may behave in strange ways.
|
||||
and LF (linefeed), the two-character sequence CRLF, an "anycrlf" convention,
|
||||
which recognizes any of the preceding three types, and an "any" convention, in
|
||||
which any Unicode line ending sequence is assumed to end a line. The Unicode
|
||||
sequences are the three just mentioned, plus VT (vertical tab, U+000B), FF
|
||||
(formfeed, U+000C), NEL (next line, U+0085), LS (line separator, U+2028), and
|
||||
PS (paragraph separator, U+2029).
|
||||
<br>
|
||||
<br>
|
||||
When the PCRE library is built, a default line-ending sequence is specified.
|
||||
This is normally the standard sequence for the operating system. Unless
|
||||
otherwise specified by this option, <b>pcregrep</b> uses the library's default.
|
||||
The possible values for this option are CR, LF, CRLF, ANYCRLF, or ANY. This
|
||||
makes it possible to use <b>pcregrep</b> on files that have come from other
|
||||
environments without having to modify their line endings. If the data that is
|
||||
being scanned does not agree with the convention set by this option,
|
||||
<b>pcregrep</b> may behave in strange ways.
|
||||
</P>
|
||||
<P>
|
||||
<b>-n</b>, <b>--line-number</b>
|
||||
Precede each output line by its line number in the file, followed by a colon
|
||||
and a space for matching lines or a hyphen and a space for context lines. If
|
||||
the filename is also being output, it precedes the line number.
|
||||
for matching lines or a hyphen for context lines. If the filename is also being
|
||||
output, it precedes the line number. This option is forced if
|
||||
<b>--line-offsets</b> is used.
|
||||
</P>
|
||||
<P>
|
||||
<b>-o</b>, <b>--only-matching</b>
|
||||
Show only the part of the line that matched a pattern. In this mode, no
|
||||
context is shown. That is, the <b>-A</b>, <b>-B</b>, and <b>-C</b> options are
|
||||
ignored.
|
||||
ignored. If there is more than one match in a line, each of them is shown
|
||||
separately. If <b>-o</b> is combined with <b>-v</b> (invert the sense of the
|
||||
match to find non-matching lines), no output is generated, but the return code
|
||||
is set appropriately. This option is mutually exclusive with
|
||||
<b>--file-offsets</b> and <b>--line-offsets</b>.
|
||||
</P>
|
||||
<P>
|
||||
<b>-q</b>, <b>--quiet</b>
|
||||
@@ -332,20 +434,20 @@ Force the patterns to match only whole words. This is equivalent to having \b
|
||||
at the start and end of the pattern.
|
||||
</P>
|
||||
<P>
|
||||
<b>-x</b>, <b>--line-regex</b>, \fP--line-regexp\fP
|
||||
<b>-x</b>, <b>--line-regex</b>, <b>--line-regexp</b>
|
||||
Force the patterns to be anchored (each must start matching at the beginning of
|
||||
a line) and in addition, require them to match entire lines. This is
|
||||
equivalent to having ^ and $ characters at the start and end of each
|
||||
alternative branch in every pattern.
|
||||
</P>
|
||||
<br><a name="SEC4" href="#TOC1">ENVIRONMENT VARIABLES</a><br>
|
||||
<br><a name="SEC5" href="#TOC1">ENVIRONMENT VARIABLES</a><br>
|
||||
<P>
|
||||
The environment variables <b>LC_ALL</b> and <b>LC_CTYPE</b> are examined, in that
|
||||
order, for a locale. The first one that is set is used. This can be overridden
|
||||
by the <b>--locale</b> option. If no locale is set, the PCRE library's default
|
||||
(usually the "C" locale) is used.
|
||||
</P>
|
||||
<br><a name="SEC5" href="#TOC1">NEWLINES</a><br>
|
||||
<br><a name="SEC6" href="#TOC1">NEWLINES</a><br>
|
||||
<P>
|
||||
The <b>-N</b> (<b>--newline</b>) option allows <b>pcregrep</b> to scan files with
|
||||
different newline conventions from the default. However, the setting of this
|
||||
@@ -354,7 +456,7 @@ the standard error and output streams. It uses the string "\n" in C
|
||||
<b>printf()</b> calls to indicate newlines, relying on the C I/O library to
|
||||
convert this to an appropriate sequence if the output is sent to a file.
|
||||
</P>
|
||||
<br><a name="SEC6" href="#TOC1">OPTIONS COMPATIBILITY</a><br>
|
||||
<br><a name="SEC7" href="#TOC1">OPTIONS COMPATIBILITY</a><br>
|
||||
<P>
|
||||
The majority of short and long forms of <b>pcregrep</b>'s options are the same
|
||||
as in the GNU <b>grep</b> program. Any long option of the form
|
||||
@@ -362,7 +464,7 @@ as in the GNU <b>grep</b> program. Any long option of the form
|
||||
(PCRE terminology). However, the <b>--locale</b>, <b>-M</b>, <b>--multiline</b>,
|
||||
<b>-u</b>, and <b>--utf-8</b> options are specific to <b>pcregrep</b>.
|
||||
</P>
|
||||
<br><a name="SEC7" href="#TOC1">OPTIONS WITH DATA</a><br>
|
||||
<br><a name="SEC8" href="#TOC1">OPTIONS WITH DATA</a><br>
|
||||
<P>
|
||||
There are four different ways in which an option with data can be specified.
|
||||
If a short form option is used, the data may follow immediately, or in the next
|
||||
@@ -389,7 +491,7 @@ for which the data is optional. If this option does have data, it must be given
|
||||
in the first form, using an equals character. Otherwise it will be assumed that
|
||||
it has no data.
|
||||
</P>
|
||||
<br><a name="SEC8" href="#TOC1">MATCHING ERRORS</a><br>
|
||||
<br><a name="SEC9" href="#TOC1">MATCHING ERRORS</a><br>
|
||||
<P>
|
||||
It is possible to supply a regular expression that takes a very long time to
|
||||
fail to match certain lines. Such patterns normally involve nested indefinite
|
||||
@@ -399,7 +501,7 @@ in these circumstances. If this happens, <b>pcregrep</b> outputs an error
|
||||
message and the line that caused the problem to the standard error stream. If
|
||||
there are more than 20 such errors, <b>pcregrep</b> gives up.
|
||||
</P>
|
||||
<br><a name="SEC9" href="#TOC1">DIAGNOSTICS</a><br>
|
||||
<br><a name="SEC10" href="#TOC1">DIAGNOSTICS</a><br>
|
||||
<P>
|
||||
Exit status is 0 if any matches were found, 1 if no matches were found, and 2
|
||||
for syntax errors and non-existent or inaccessible files (even if matches were
|
||||
@@ -407,18 +509,25 @@ found in other files) or too many matching errors. Using the <b>-s</b> option to
|
||||
suppress error messages about inaccessible files does not affect the return
|
||||
code.
|
||||
</P>
|
||||
<br><a name="SEC10" href="#TOC1">AUTHOR</a><br>
|
||||
<br><a name="SEC11" href="#TOC1">SEE ALSO</a><br>
|
||||
<P>
|
||||
<b>pcrepattern</b>(3), <b>pcretest</b>(1).
|
||||
</P>
|
||||
<br><a name="SEC12" href="#TOC1">AUTHOR</a><br>
|
||||
<P>
|
||||
Philip Hazel
|
||||
<br>
|
||||
University Computing Service
|
||||
<br>
|
||||
Cambridge CB2 3QG, England.
|
||||
</P>
|
||||
<P>
|
||||
Last updated: 06 June 2006
|
||||
Cambridge CB2 3QH, England.
|
||||
<br>
|
||||
</P>
|
||||
<br><a name="SEC13" href="#TOC1">REVISION</a><br>
|
||||
<P>
|
||||
Last updated: 01 March 2009
|
||||
<br>
|
||||
Copyright © 1997-2009 University of Cambridge.
|
||||
<br>
|
||||
Copyright © 1997-2006 University of Cambridge.
|
||||
<p>
|
||||
Return to the <a href="index.html">PCRE index page</a>.
|
||||
</p>
|
||||
|
||||
@@ -16,9 +16,11 @@ man page, in case the conversion went wrong.
|
||||
<li><a name="TOC1" href="#SEC1">PCRE MATCHING ALGORITHMS</a>
|
||||
<li><a name="TOC2" href="#SEC2">REGULAR EXPRESSIONS AS TREES</a>
|
||||
<li><a name="TOC3" href="#SEC3">THE STANDARD MATCHING ALGORITHM</a>
|
||||
<li><a name="TOC4" href="#SEC4">THE DFA MATCHING ALGORITHM</a>
|
||||
<li><a name="TOC5" href="#SEC5">ADVANTAGES OF THE DFA ALGORITHM</a>
|
||||
<li><a name="TOC6" href="#SEC6">DISADVANTAGES OF THE DFA ALGORITHM</a>
|
||||
<li><a name="TOC4" href="#SEC4">THE ALTERNATIVE MATCHING ALGORITHM</a>
|
||||
<li><a name="TOC5" href="#SEC5">ADVANTAGES OF THE ALTERNATIVE ALGORITHM</a>
|
||||
<li><a name="TOC6" href="#SEC6">DISADVANTAGES OF THE ALTERNATIVE ALGORITHM</a>
|
||||
<li><a name="TOC7" href="#SEC7">AUTHOR</a>
|
||||
<li><a name="TOC8" href="#SEC8">REVISION</a>
|
||||
</ul>
|
||||
<br><a name="SEC1" href="#TOC1">PCRE MATCHING ALGORITHMS</a><br>
|
||||
<P>
|
||||
@@ -46,7 +48,7 @@ is matched against the string
|
||||
<something> <something else> <something further>
|
||||
</pre>
|
||||
there are three possible answers. The standard algorithm finds only one of
|
||||
them, whereas the DFA algorithm finds all three.
|
||||
them, whereas the alternative algorithm finds all three.
|
||||
</P>
|
||||
<br><a name="SEC2" href="#TOC1">REGULAR EXPRESSIONS AS TREES</a><br>
|
||||
<P>
|
||||
@@ -59,8 +61,8 @@ correspond to the two matching algorithms provided by PCRE.
|
||||
</P>
|
||||
<br><a name="SEC3" href="#TOC1">THE STANDARD MATCHING ALGORITHM</a><br>
|
||||
<P>
|
||||
In the terminology of Jeffrey Friedl's book \fIMastering Regular
|
||||
Expressions\fP, the standard algorithm is an "NFA algorithm". It conducts a
|
||||
In the terminology of Jeffrey Friedl's book "Mastering Regular
|
||||
Expressions", the standard algorithm is an "NFA algorithm". It conducts a
|
||||
depth-first search of the pattern tree. That is, it proceeds along a single
|
||||
path through the tree, checking that the subject matches what is required. When
|
||||
there is a mismatch, the algorithm tries any alternatives at the current point,
|
||||
@@ -83,14 +85,15 @@ straightforward for this algorithm to keep track of the substrings that are
|
||||
matched by portions of the pattern in parentheses. This provides support for
|
||||
capturing parentheses and back references.
|
||||
</P>
|
||||
<br><a name="SEC4" href="#TOC1">THE DFA MATCHING ALGORITHM</a><br>
|
||||
<br><a name="SEC4" href="#TOC1">THE ALTERNATIVE MATCHING ALGORITHM</a><br>
|
||||
<P>
|
||||
DFA stands for "deterministic finite automaton", but you do not need to
|
||||
understand the origins of that name. This algorithm conducts a breadth-first
|
||||
search of the tree. Starting from the first matching point in the subject, it
|
||||
scans the subject string from left to right, once, character by character, and
|
||||
as it does this, it remembers all the paths through the tree that represent
|
||||
valid matches.
|
||||
This algorithm conducts a breadth-first search of the tree. Starting from the
|
||||
first matching point in the subject, it scans the subject string from left to
|
||||
right, once, character by character, and as it does this, it remembers all the
|
||||
paths through the tree that represent valid matches. In Friedl's terminology,
|
||||
this is a kind of "DFA algorithm", though it is not implemented as a
|
||||
traditional finite state machine (it keeps multiple states active
|
||||
simultaneously).
|
||||
</P>
|
||||
<P>
|
||||
The scan continues until either the end of the subject is reached, or there are
|
||||
@@ -114,12 +117,21 @@ matches that start at later positions.
|
||||
</P>
|
||||
<P>
|
||||
There are a number of features of PCRE regular expressions that are not
|
||||
supported by the DFA matching algorithm. They are as follows:
|
||||
supported by the alternative matching algorithm. They are as follows:
|
||||
</P>
|
||||
<P>
|
||||
1. Because the algorithm finds all possible matches, the greedy or ungreedy
|
||||
nature of repetition quantifiers is not relevant. Greedy and ungreedy
|
||||
quantifiers are treated in exactly the same way.
|
||||
quantifiers are treated in exactly the same way. However, possessive
|
||||
quantifiers can make a difference when what follows could also match what is
|
||||
quantified, for example in a pattern like this:
|
||||
<pre>
|
||||
^a++\w!
|
||||
</pre>
|
||||
This pattern matches "aaab!" but not "aaa!", which would be matched by a
|
||||
non-possessive quantifier. Similarly, if an atomic group is present, it is
|
||||
matched as if it were a standalone pattern at the current point, and the
|
||||
longest match is then "locked in" for the rest of the overall pattern.
|
||||
</P>
|
||||
<P>
|
||||
2. When dealing with multiple paths through the tree simultaneously, it is not
|
||||
@@ -133,22 +145,30 @@ not supported, and cause errors if encountered.
|
||||
</P>
|
||||
<P>
|
||||
4. For the same reason, conditional expressions that use a backreference as the
|
||||
condition are not supported.
|
||||
condition or test for a specific group recursion are not supported.
|
||||
</P>
|
||||
<P>
|
||||
5. Callouts are supported, but the value of the <i>capture_top</i> field is
|
||||
5. Because many paths through the tree may be active, the \K escape sequence,
|
||||
which resets the start of the match when encountered (but may be on some paths
|
||||
and not on others), is not supported. It causes an error if encountered.
|
||||
</P>
|
||||
<P>
|
||||
6. Callouts are supported, but the value of the <i>capture_top</i> field is
|
||||
always 1, and the value of the <i>capture_last</i> field is always -1.
|
||||
</P>
|
||||
<P>
|
||||
6.
|
||||
The \C escape sequence, which (in the standard algorithm) matches a single
|
||||
byte, even in UTF-8 mode, is not supported because the DFA algorithm moves
|
||||
through the subject string one character at a time, for all active paths
|
||||
7. The \C escape sequence, which (in the standard algorithm) matches a single
|
||||
byte, even in UTF-8 mode, is not supported because the alternative algorithm
|
||||
moves through the subject string one character at a time, for all active paths
|
||||
through the tree.
|
||||
</P>
|
||||
<br><a name="SEC5" href="#TOC1">ADVANTAGES OF THE DFA ALGORITHM</a><br>
|
||||
<P>
|
||||
Using the DFA matching algorithm provides the following advantages:
|
||||
8. Except for (*FAIL), the backtracking control verbs such as (*PRUNE) are not
|
||||
supported. (*FAIL) is supported, and behaves like a failing negative assertion.
|
||||
</P>
|
||||
<br><a name="SEC5" href="#TOC1">ADVANTAGES OF THE ALTERNATIVE ALGORITHM</a><br>
|
||||
<P>
|
||||
Using the alternative matching algorithm provides the following advantages:
|
||||
</P>
|
||||
<P>
|
||||
1. All possible matches (at a single point in the subject) are automatically
|
||||
@@ -159,17 +179,18 @@ callouts.
|
||||
<P>
|
||||
2. There is much better support for partial matching. The restrictions on the
|
||||
content of the pattern that apply when using the standard algorithm for partial
|
||||
matching do not apply to the DFA algorithm. For non-anchored patterns, the
|
||||
starting position of a partial match is available.
|
||||
matching do not apply to the alternative algorithm. For non-anchored patterns,
|
||||
the starting position of a partial match is available.
|
||||
</P>
|
||||
<P>
|
||||
3. Because the DFA algorithm scans the subject string just once, and never
|
||||
needs to backtrack, it is possible to pass very long subject strings to the
|
||||
matching function in several pieces, checking for partial matching each time.
|
||||
3. Because the alternative algorithm scans the subject string just once, and
|
||||
never needs to backtrack, it is possible to pass very long subject strings to
|
||||
the matching function in several pieces, checking for partial matching each
|
||||
time.
|
||||
</P>
|
||||
<br><a name="SEC6" href="#TOC1">DISADVANTAGES OF THE DFA ALGORITHM</a><br>
|
||||
<br><a name="SEC6" href="#TOC1">DISADVANTAGES OF THE ALTERNATIVE ALGORITHM</a><br>
|
||||
<P>
|
||||
The DFA algorithm suffers from a number of disadvantages:
|
||||
The alternative algorithm suffers from a number of disadvantages:
|
||||
</P>
|
||||
<P>
|
||||
1. It is substantially slower than the standard algorithm. This is partly
|
||||
@@ -180,13 +201,24 @@ less susceptible to optimization.
|
||||
2. Capturing parentheses and back references are not supported.
|
||||
</P>
|
||||
<P>
|
||||
3. The "atomic group" feature of PCRE regular expressions is supported, but
|
||||
does not provide the advantage that it does for the standard algorithm.
|
||||
3. Although atomic groups are supported, their use does not provide the
|
||||
performance advantage that it does for the standard algorithm.
|
||||
</P>
|
||||
<br><a name="SEC7" href="#TOC1">AUTHOR</a><br>
|
||||
<P>
|
||||
Last updated: 06 June 2006
|
||||
Philip Hazel
|
||||
<br>
|
||||
University Computing Service
|
||||
<br>
|
||||
Cambridge CB2 3QH, England.
|
||||
<br>
|
||||
</P>
|
||||
<br><a name="SEC8" href="#TOC1">REVISION</a><br>
|
||||
<P>
|
||||
Last updated: 19 April 2008
|
||||
<br>
|
||||
Copyright © 1997-2008 University of Cambridge.
|
||||
<br>
|
||||
Copyright © 1997-2006 University of Cambridge.
|
||||
<p>
|
||||
Return to the <a href="index.html">PCRE index page</a>.
|
||||
</p>
|
||||
|
||||
@@ -17,6 +17,8 @@ man page, in case the conversion went wrong.
|
||||
<li><a name="TOC2" href="#SEC2">RESTRICTED PATTERNS FOR PCRE_PARTIAL</a>
|
||||
<li><a name="TOC3" href="#SEC3">EXAMPLE OF PARTIAL MATCHING USING PCRETEST</a>
|
||||
<li><a name="TOC4" href="#SEC4">MULTI-SEGMENT MATCHING WITH pcre_dfa_exec()</a>
|
||||
<li><a name="TOC5" href="#SEC5">AUTHOR</a>
|
||||
<li><a name="TOC6" href="#SEC6">REVISION</a>
|
||||
</ul>
|
||||
<br><a name="SEC1" href="#TOC1">PARTIAL MATCHING IN PCRE</a><br>
|
||||
<P>
|
||||
@@ -90,6 +92,8 @@ envisaged for this facility, this is not felt to be a major restriction.
|
||||
<P>
|
||||
If PCRE_PARTIAL is set for a pattern that does not conform to the restrictions,
|
||||
<b>pcre_exec()</b> returns the error code PCRE_ERROR_BADPARTIAL (-13).
|
||||
You can use the PCRE_INFO_OKPARTIAL call to <b>pcre_fullinfo()</b> to find out
|
||||
if a compiled pattern can be used for partial matching.
|
||||
</P>
|
||||
<br><a name="SEC3" href="#TOC1">EXAMPLE OF PARTIAL MATCHING USING PCRETEST</a><br>
|
||||
<P>
|
||||
@@ -112,8 +116,9 @@ uses the date example quoted above:
|
||||
</pre>
|
||||
The first data string is matched completely, so <b>pcretest</b> shows the
|
||||
matched substrings. The remaining four strings do not match the complete
|
||||
pattern, but the first two are partial matches. The same test, using DFA
|
||||
matching (by means of the \D escape sequence), produces the following output:
|
||||
pattern, but the first two are partial matches. The same test, using
|
||||
<b>pcre_dfa_exec()</b> matching (by means of the \D escape sequence), produces
|
||||
the following output:
|
||||
<pre>
|
||||
re> /^\d?\d(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\d\d$/
|
||||
data> 25jun04\P\D
|
||||
@@ -134,11 +139,11 @@ available.
|
||||
<P>
|
||||
When a partial match has been found using <b>pcre_dfa_exec()</b>, it is possible
|
||||
to continue the match by providing additional subject data and calling
|
||||
<b>pcre_dfa_exec()</b> again with the PCRE_DFA_RESTART option and the same
|
||||
working space (where details of the previous partial match are stored). Here is
|
||||
an example using <b>pcretest</b>, where the \R escape sequence sets the
|
||||
PCRE_DFA_RESTART option and the \D escape sequence requests the use of
|
||||
<b>pcre_dfa_exec()</b>:
|
||||
<b>pcre_dfa_exec()</b> again with the same compiled regular expression, this
|
||||
time setting the PCRE_DFA_RESTART option. You must also pass the same working
|
||||
space as before, because this is where details of the previous partial match
|
||||
are stored. Here is an example using <b>pcretest</b>, using the \R escape
|
||||
sequence to set the PCRE_DFA_RESTART option (\P and \D are as above):
|
||||
<pre>
|
||||
re> /^\d?\d(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\d\d$/
|
||||
data> 23ja\P\D
|
||||
@@ -153,9 +158,10 @@ not retain the previously partially-matched string. It is up to the calling
|
||||
program to do that if it needs to.
|
||||
</P>
|
||||
<P>
|
||||
This facility can be used to pass very long subject strings to
|
||||
<b>pcre_dfa_exec()</b>. However, some care is needed for certain types of
|
||||
pattern.
|
||||
You can set PCRE_PARTIAL with PCRE_DFA_RESTART to continue partial matching
|
||||
over multiple segments. This facility can be used to pass very long subject
|
||||
strings to <b>pcre_dfa_exec()</b>. However, some care is needed for certain
|
||||
types of pattern.
|
||||
</P>
|
||||
<P>
|
||||
1. If the pattern contains tests for the beginning or end of a line, you need
|
||||
@@ -165,7 +171,7 @@ subject string for any call does not contain the beginning or end of a line.
|
||||
<P>
|
||||
2. If the pattern contains backward assertions (including \b or \B), you need
|
||||
to arrange for some overlap in the subject strings to allow for this. For
|
||||
example, you could pass the subject in chunks that were 500 bytes long, but in
|
||||
example, you could pass the subject in chunks that are 500 bytes long, but in
|
||||
a buffer of 700 bytes, with the starting offset set to 200 and the previous 200
|
||||
bytes at the start of the buffer.
|
||||
</P>
|
||||
@@ -174,7 +180,7 @@ bytes at the start of the buffer.
|
||||
always produce exactly the same result as matching over one single long string.
|
||||
The difference arises when there are multiple matching possibilities, because a
|
||||
partial match result is given only when there are no completed matches in a
|
||||
call to fBpcre_dfa_exec()\fP. This means that as soon as the shortest match has
|
||||
call to <b>pcre_dfa_exec()</b>. This means that as soon as the shortest match has
|
||||
been found, continuation to a new subject segment is no longer possible.
|
||||
Consider this <b>pcretest</b> example:
|
||||
<pre>
|
||||
@@ -216,10 +222,21 @@ patterns or patterns such as:
|
||||
</pre>
|
||||
where no string can be a partial match for both alternatives.
|
||||
</P>
|
||||
<br><a name="SEC5" href="#TOC1">AUTHOR</a><br>
|
||||
<P>
|
||||
Last updated: 16 January 2006
|
||||
Philip Hazel
|
||||
<br>
|
||||
University Computing Service
|
||||
<br>
|
||||
Cambridge CB2 3QH, England.
|
||||
<br>
|
||||
</P>
|
||||
<br><a name="SEC6" href="#TOC1">REVISION</a><br>
|
||||
<P>
|
||||
Last updated: 04 June 2007
|
||||
<br>
|
||||
Copyright © 1997-2007 University of Cambridge.
|
||||
<br>
|
||||
Copyright © 1997-2006 University of Cambridge.
|
||||
<p>
|
||||
Return to the <a href="index.html">PCRE index page</a>.
|
||||
</p>
|
||||
|
||||
+813
-212
File diff suppressed because it is too large
Load Diff
@@ -16,13 +16,73 @@ man page, in case the conversion went wrong.
|
||||
PCRE PERFORMANCE
|
||||
</b><br>
|
||||
<P>
|
||||
Certain items that may appear in regular expression patterns are more efficient
|
||||
Two aspects of performance are discussed below: memory usage and processing
|
||||
time. The way you express your pattern as a regular expression can affect both
|
||||
of them.
|
||||
</P>
|
||||
<br><b>
|
||||
MEMORY USAGE
|
||||
</b><br>
|
||||
<P>
|
||||
Patterns are compiled by PCRE into a reasonably efficient byte code, so that
|
||||
most simple patterns do not use much memory. However, there is one case where
|
||||
memory usage can be unexpectedly large. When a parenthesized subpattern has a
|
||||
quantifier with a minimum greater than 1 and/or a limited maximum, the whole
|
||||
subpattern is repeated in the compiled code. For example, the pattern
|
||||
<pre>
|
||||
(abc|def){2,4}
|
||||
</pre>
|
||||
is compiled as if it were
|
||||
<pre>
|
||||
(abc|def)(abc|def)((abc|def)(abc|def)?)?
|
||||
</pre>
|
||||
(Technical aside: It is done this way so that backtrack points within each of
|
||||
the repetitions can be independently maintained.)
|
||||
</P>
|
||||
<P>
|
||||
For regular expressions whose quantifiers use only small numbers, this is not
|
||||
usually a problem. However, if the numbers are large, and particularly if such
|
||||
repetitions are nested, the memory usage can become an embarrassment. For
|
||||
example, the very simple pattern
|
||||
<pre>
|
||||
((ab){1,1000}c){1,3}
|
||||
</pre>
|
||||
uses 51K bytes when compiled. When PCRE is compiled with its default internal
|
||||
pointer size of two bytes, the size limit on a compiled pattern is 64K, and
|
||||
this is reached with the above pattern if the outer repetition is increased
|
||||
from 3 to 4. PCRE can be compiled to use larger internal pointers and thus
|
||||
handle larger compiled patterns, but it is better to try to rewrite your
|
||||
pattern to use less memory if you can.
|
||||
</P>
|
||||
<P>
|
||||
One way of reducing the memory usage for such patterns is to make use of PCRE's
|
||||
<a href="pcrepattern.html#subpatternsassubroutines">"subroutine"</a>
|
||||
facility. Re-writing the above pattern as
|
||||
<pre>
|
||||
((ab)(?2){0,999}c)(?1){0,2}
|
||||
</pre>
|
||||
reduces the memory requirements to 18K, and indeed it remains under 20K even
|
||||
with the outer repetition increased to 100. However, this pattern is not
|
||||
exactly equivalent, because the "subroutine" calls are treated as
|
||||
<a href="pcrepattern.html#atomicgroup">atomic groups</a>
|
||||
into which there can be no backtracking if there is a subsequent matching
|
||||
failure. Therefore, PCRE cannot do this kind of rewriting automatically.
|
||||
Furthermore, there is a noticeable loss of speed when executing the modified
|
||||
pattern. Nevertheless, if the atomic grouping is not a problem and the loss of
|
||||
speed is acceptable, this kind of rewriting will allow you to process patterns
|
||||
that PCRE cannot otherwise handle.
|
||||
</P>
|
||||
<br><b>
|
||||
PROCESSING TIME
|
||||
</b><br>
|
||||
<P>
|
||||
Certain items in regular expression patterns are processed more efficiently
|
||||
than others. It is more efficient to use a character class like [aeiou] than a
|
||||
set of alternatives such as (a|e|i|o|u). In general, the simplest construction
|
||||
that provides the required behaviour is usually the most efficient. Jeffrey
|
||||
Friedl's book contains a lot of useful general discussion about optimizing
|
||||
regular expressions for efficient performance. This document contains a few
|
||||
observations about PCRE.
|
||||
set of single-character alternatives such as (a|e|i|o|u). In general, the
|
||||
simplest construction that provides the required behaviour is usually the most
|
||||
efficient. Jeffrey Friedl's book contains a lot of useful general discussion
|
||||
about optimizing regular expressions for efficient performance. This document
|
||||
contains a few observations about PCRE.
|
||||
</P>
|
||||
<P>
|
||||
Using Unicode character properties (the \p, \P, and \X escapes) is slow,
|
||||
@@ -58,14 +118,15 @@ Beware of patterns that contain nested indefinite repeats. These can take a
|
||||
long time to run when applied to a string that does not match. Consider the
|
||||
pattern fragment
|
||||
<pre>
|
||||
(a+)*
|
||||
^(a+)*
|
||||
</pre>
|
||||
This can match "aaaa" in 33 different ways, and this number increases very
|
||||
This can match "aaaa" in 16 different ways, and this number increases very
|
||||
rapidly as the string gets longer. (The * repeat can match 0, 1, 2, 3, or 4
|
||||
times, and for each of those cases other than 0, the + repeats can match
|
||||
times, and for each of those cases other than 0 or 4, the + repeats can match
|
||||
different numbers of times.) When the remainder of the pattern is such that the
|
||||
entire match is going to fail, PCRE has in principle to try every possible
|
||||
variation, and this can take an extremely long time.
|
||||
variation, and this can take an extremely long time, even for relatively short
|
||||
strings.
|
||||
</P>
|
||||
<P>
|
||||
An optimization catches some of the more simple cases such as
|
||||
@@ -88,10 +149,25 @@ appreciable time with strings longer than about 20 characters.
|
||||
In many cases, the solution to this kind of performance issue is to use an
|
||||
atomic group or a possessive quantifier.
|
||||
</P>
|
||||
<br><b>
|
||||
AUTHOR
|
||||
</b><br>
|
||||
<P>
|
||||
Last updated: 28 February 2005
|
||||
Philip Hazel
|
||||
<br>
|
||||
University Computing Service
|
||||
<br>
|
||||
Cambridge CB2 3QH, England.
|
||||
<br>
|
||||
</P>
|
||||
<br><b>
|
||||
REVISION
|
||||
</b><br>
|
||||
<P>
|
||||
Last updated: 06 March 2007
|
||||
<br>
|
||||
Copyright © 1997-2007 University of Cambridge.
|
||||
<br>
|
||||
Copyright © 1997-2005 University of Cambridge.
|
||||
<p>
|
||||
Return to the <a href="index.html">PCRE index page</a>.
|
||||
</p>
|
||||
|
||||
@@ -21,6 +21,7 @@ man page, in case the conversion went wrong.
|
||||
<li><a name="TOC6" href="#SEC6">ERROR MESSAGES</a>
|
||||
<li><a name="TOC7" href="#SEC7">MEMORY USAGE</a>
|
||||
<li><a name="TOC8" href="#SEC8">AUTHOR</a>
|
||||
<li><a name="TOC9" href="#SEC9">REVISION</a>
|
||||
</ul>
|
||||
<br><a name="SEC1" href="#TOC1">SYNOPSIS OF POSIX API</a><br>
|
||||
<P>
|
||||
@@ -58,11 +59,11 @@ command for linking an application that uses them. Because the POSIX functions
|
||||
call the native ones, it is also necessary to add <b>-lpcre</b>.
|
||||
</P>
|
||||
<P>
|
||||
I have implemented only those option bits that can be reasonably mapped to PCRE
|
||||
native options. In addition, the option REG_EXTENDED is defined with the value
|
||||
zero. This has no effect, but since programs that are written to the POSIX
|
||||
interface often use it, this makes it easier to slot in PCRE as a replacement
|
||||
library. Other POSIX options are not even defined.
|
||||
I have implemented only those POSIX option bits that can be reasonably mapped
|
||||
to PCRE native options. In addition, the option REG_EXTENDED is defined with
|
||||
the value zero. This has no effect, but since programs that are written to the
|
||||
POSIX interface often use it, this makes it easier to slot in PCRE as a
|
||||
replacement library. Other POSIX options are not even defined.
|
||||
</P>
|
||||
<P>
|
||||
When PCRE is called via these functions, it is only the API that is POSIX-like
|
||||
@@ -179,18 +180,36 @@ REG_NEWLINE action.
|
||||
<br><a name="SEC5" href="#TOC1">MATCHING A PATTERN</a><br>
|
||||
<P>
|
||||
The function <b>regexec()</b> is called to match a compiled pattern <i>preg</i>
|
||||
against a given <i>string</i>, which is terminated by a zero byte, subject to
|
||||
the options in <i>eflags</i>. These can be:
|
||||
against a given <i>string</i>, which is by default terminated by a zero byte
|
||||
(but see REG_STARTEND below), subject to the options in <i>eflags</i>. These can
|
||||
be:
|
||||
<pre>
|
||||
REG_NOTBOL
|
||||
</pre>
|
||||
The PCRE_NOTBOL option is set when calling the underlying PCRE matching
|
||||
function.
|
||||
<pre>
|
||||
REG_NOTEMPTY
|
||||
</pre>
|
||||
The PCRE_NOTEMPTY option is set when calling the underlying PCRE matching
|
||||
function. Note that REG_NOTEMPTY is not part of the POSIX standard. However,
|
||||
setting this option can give more POSIX-like behaviour in some situations.
|
||||
<pre>
|
||||
REG_NOTEOL
|
||||
</pre>
|
||||
The PCRE_NOTEOL option is set when calling the underlying PCRE matching
|
||||
function.
|
||||
<pre>
|
||||
REG_STARTEND
|
||||
</pre>
|
||||
The string is considered to start at <i>string</i> + <i>pmatch[0].rm_so</i> and
|
||||
to have a terminating NUL located at <i>string</i> + <i>pmatch[0].rm_eo</i>
|
||||
(there need not actually be a NUL at that location), regardless of the value of
|
||||
<i>nmatch</i>. This is a BSD extension, compatible with but not specified by
|
||||
IEEE Standard 1003.2 (POSIX.2), and should be used with caution in software
|
||||
intended to be portable to other systems. Note that a non-zero <i>rm_so</i> does
|
||||
not imply REG_NOTBOL; REG_STARTEND affects only the location of the string, not
|
||||
how it is matched.
|
||||
</P>
|
||||
<P>
|
||||
If the pattern was compiled with the REG_NOSUB flag, no data about any matched
|
||||
@@ -231,14 +250,17 @@ memory, after which <i>preg</i> may no longer be used as a compiled expression.
|
||||
<P>
|
||||
Philip Hazel
|
||||
<br>
|
||||
University Computing Service,
|
||||
University Computing Service
|
||||
<br>
|
||||
Cambridge CB2 3QH, England.
|
||||
<br>
|
||||
Cambridge CB2 3QG, England.
|
||||
</P>
|
||||
<br><a name="SEC9" href="#TOC1">REVISION</a><br>
|
||||
<P>
|
||||
Last updated: 16 January 2006
|
||||
Last updated: 11 March 2009
|
||||
<br>
|
||||
Copyright © 1997-2009 University of Cambridge.
|
||||
<br>
|
||||
Copyright © 1997-2006 University of Cambridge.
|
||||
<p>
|
||||
Return to the <a href="index.html">PCRE index page</a>.
|
||||
</p>
|
||||
|
||||
@@ -17,6 +17,8 @@ man page, in case the conversion went wrong.
|
||||
<li><a name="TOC2" href="#SEC2">SAVING A COMPILED PATTERN</a>
|
||||
<li><a name="TOC3" href="#SEC3">RE-USING A PRECOMPILED PATTERN</a>
|
||||
<li><a name="TOC4" href="#SEC4">COMPATIBILITY WITH DIFFERENT PCRE RELEASES</a>
|
||||
<li><a name="TOC5" href="#SEC5">AUTHOR</a>
|
||||
<li><a name="TOC6" href="#SEC6">REVISION</a>
|
||||
</ul>
|
||||
<br><a name="SEC1" href="#TOC1">SAVING AND RE-USING PRECOMPILED PCRE PATTERNS</a><br>
|
||||
<P>
|
||||
@@ -32,7 +34,9 @@ tables, it is a little bit more complicated.
|
||||
If you save compiled patterns to a file, you can copy them to a different host
|
||||
and run them there. This works even if the new host has the opposite endianness
|
||||
to the one on which the patterns were compiled. There may be a small
|
||||
performance penalty, but it should be insignificant.
|
||||
performance penalty, but it should be insignificant. However, compiling regular
|
||||
expressions with one version of PCRE for use with a different version is not
|
||||
guaranteed to work and may cause crashes.
|
||||
</P>
|
||||
<br><a name="SEC2" href="#TOC1">SAVING A COMPILED PATTERN</a><br>
|
||||
<P>
|
||||
@@ -120,21 +124,25 @@ usual way.
|
||||
</P>
|
||||
<br><a name="SEC4" href="#TOC1">COMPATIBILITY WITH DIFFERENT PCRE RELEASES</a><br>
|
||||
<P>
|
||||
The layout of the control block that is at the start of the data that makes up
|
||||
a compiled pattern was changed for release 5.0. If you have any saved patterns
|
||||
that were compiled with previous releases (not a facility that was previously
|
||||
advertised), you will have to recompile them for release 5.0. However, from now
|
||||
on, it should be possible to make changes in a compatible manner.
|
||||
In general, it is safest to recompile all saved patterns when you update to a
|
||||
new PCRE release, though not all updates actually require this. Recompiling is
|
||||
definitely needed for release 7.2.
|
||||
</P>
|
||||
<br><a name="SEC5" href="#TOC1">AUTHOR</a><br>
|
||||
<P>
|
||||
Notwithstanding the above, if you have any saved patterns in UTF-8 mode that
|
||||
use \p or \P that were compiled with any release up to and including 6.4, you
|
||||
will have to recompile them for release 6.5 and above.
|
||||
</P>
|
||||
<P>
|
||||
Last updated: 01 February 2006
|
||||
Philip Hazel
|
||||
<br>
|
||||
University Computing Service
|
||||
<br>
|
||||
Cambridge CB2 3QH, England.
|
||||
<br>
|
||||
</P>
|
||||
<br><a name="SEC6" href="#TOC1">REVISION</a><br>
|
||||
<P>
|
||||
Last updated: 13 June 2007
|
||||
<br>
|
||||
Copyright © 1997-2007 University of Cambridge.
|
||||
<br>
|
||||
Copyright © 1997-2006 University of Cambridge.
|
||||
<p>
|
||||
Return to the <a href="index.html">PCRE index page</a>.
|
||||
</p>
|
||||
|
||||
@@ -72,10 +72,25 @@ need to add
|
||||
</pre>
|
||||
(for example) to the compile command to get round this problem.
|
||||
</P>
|
||||
<br><b>
|
||||
AUTHOR
|
||||
</b><br>
|
||||
<P>
|
||||
Last updated: 09 September 2004
|
||||
Philip Hazel
|
||||
<br>
|
||||
University Computing Service
|
||||
<br>
|
||||
Cambridge CB2 3QH, England.
|
||||
<br>
|
||||
</P>
|
||||
<br><b>
|
||||
REVISION
|
||||
</b><br>
|
||||
<P>
|
||||
Last updated: 23 January 2008
|
||||
<br>
|
||||
Copyright © 1997-2008 University of Cambridge.
|
||||
<br>
|
||||
Copyright © 1997-2004 University of Cambridge.
|
||||
<p>
|
||||
Return to the <a href="index.html">PCRE index page</a>.
|
||||
</p>
|
||||
|
||||
@@ -63,7 +63,7 @@ frame for each matched character. For a long string, a lot of stack is
|
||||
required. Consider now this rewritten pattern, which matches exactly the same
|
||||
strings:
|
||||
<pre>
|
||||
([^<]++|<(?!inet))
|
||||
([^<]++|<(?!inet))+
|
||||
</pre>
|
||||
This uses very much less stack, because runs of characters that do not contain
|
||||
"<" are "swallowed" in one item inside the parentheses. Recursion happens only
|
||||
@@ -73,33 +73,30 @@ backtracking into the runs of non-"<" characters, but that is not related to
|
||||
stack usage.
|
||||
</P>
|
||||
<P>
|
||||
This example shows that one way of avoiding stack problems when matching long
|
||||
subject strings is to write repeated parenthesized subpatterns to match more
|
||||
than one character whenever possible.
|
||||
</P>
|
||||
<br><b>
|
||||
Compiling PCRE to use heap instead of stack
|
||||
</b><br>
|
||||
<P>
|
||||
In environments where stack memory is constrained, you might want to compile
|
||||
PCRE to use heap memory instead of stack for remembering back-up points. This
|
||||
makes it run a lot more slowly, however. Details of how to do this are given in
|
||||
the
|
||||
<a href="pcrebuild.html"><b>pcrebuild</b></a>
|
||||
documentation.
|
||||
</P>
|
||||
<P>
|
||||
In Unix-like environments, there is not often a problem with the stack, though
|
||||
the default limit on stack size varies from system to system. Values from 8Mb
|
||||
to 64Mb are common. You can find your default limit by running the command:
|
||||
<pre>
|
||||
ulimit -s
|
||||
</pre>
|
||||
The effect of running out of stack is often SIGSEGV, though sometimes an error
|
||||
message is given. You can normally increase the limit on stack size by code
|
||||
such as this:
|
||||
<pre>
|
||||
struct rlimit rlim;
|
||||
getrlimit(RLIMIT_STACK, &rlim);
|
||||
rlim.rlim_cur = 100*1024*1024;
|
||||
setrlimit(RLIMIT_STACK, &rlim);
|
||||
</pre>
|
||||
This reads the current limits (soft and hard) using <b>getrlimit()</b>, then
|
||||
attempts to increase the soft limit to 100Mb using <b>setrlimit()</b>. You must
|
||||
do this before calling <b>pcre_exec()</b>.
|
||||
documentation. When built in this way, instead of using the stack, PCRE obtains
|
||||
and frees memory by calling the functions that are pointed to by the
|
||||
<b>pcre_stack_malloc</b> and <b>pcre_stack_free</b> variables. By default, these
|
||||
point to <b>malloc()</b> and <b>free()</b>, but you can replace the pointers to
|
||||
cause PCRE to use your own functions. Since the block sizes are always the
|
||||
same, and are always freed in reverse order, it may be possible to implement
|
||||
customized memory handlers that are more efficient than the standard functions.
|
||||
</P>
|
||||
<br><b>
|
||||
Limiting PCRE's stack usage
|
||||
</b><br>
|
||||
<P>
|
||||
PCRE has an internal counter that can be used to limit the depth of recursion,
|
||||
and thus cause <b>pcre_exec()</b> to give an error code before it runs out of
|
||||
@@ -116,12 +113,60 @@ As a very rough rule of thumb, you should reckon on about 500 bytes per
|
||||
recursion. Thus, if you want to limit your stack usage to 8Mb, you
|
||||
should set the limit at 16000 recursions. A 64Mb stack, on the other hand, can
|
||||
support around 128000 recursions. The <b>pcretest</b> test program has a command
|
||||
line option (<b>-S</b>) that can be used to increase its stack.
|
||||
line option (<b>-S</b>) that can be used to increase the size of its stack.
|
||||
</P>
|
||||
<br><b>
|
||||
Changing stack size in Unix-like systems
|
||||
</b><br>
|
||||
<P>
|
||||
Last updated: 29 June 2006
|
||||
In Unix-like environments, there is not often a problem with the stack unless
|
||||
very long strings are involved, though the default limit on stack size varies
|
||||
from system to system. Values from 8Mb to 64Mb are common. You can find your
|
||||
default limit by running the command:
|
||||
<pre>
|
||||
ulimit -s
|
||||
</pre>
|
||||
Unfortunately, the effect of running out of stack is often SIGSEGV, though
|
||||
sometimes a more explicit error message is given. You can normally increase the
|
||||
limit on stack size by code such as this:
|
||||
<pre>
|
||||
struct rlimit rlim;
|
||||
getrlimit(RLIMIT_STACK, &rlim);
|
||||
rlim.rlim_cur = 100*1024*1024;
|
||||
setrlimit(RLIMIT_STACK, &rlim);
|
||||
</pre>
|
||||
This reads the current limits (soft and hard) using <b>getrlimit()</b>, then
|
||||
attempts to increase the soft limit to 100Mb using <b>setrlimit()</b>. You must
|
||||
do this before calling <b>pcre_exec()</b>.
|
||||
</P>
|
||||
<br><b>
|
||||
Changing stack size in Mac OS X
|
||||
</b><br>
|
||||
<P>
|
||||
Using <b>setrlimit()</b>, as described above, should also work on Mac OS X. It
|
||||
is also possible to set a stack size when linking a program. There is a
|
||||
discussion about stack sizes in Mac OS X at this web site:
|
||||
<a href="http://developer.apple.com/qa/qa2005/qa1419.html">http://developer.apple.com/qa/qa2005/qa1419.html</a>.
|
||||
</P>
|
||||
<br><b>
|
||||
AUTHOR
|
||||
</b><br>
|
||||
<P>
|
||||
Philip Hazel
|
||||
<br>
|
||||
University Computing Service
|
||||
<br>
|
||||
Cambridge CB2 3QH, England.
|
||||
<br>
|
||||
</P>
|
||||
<br><b>
|
||||
REVISION
|
||||
</b><br>
|
||||
<P>
|
||||
Last updated: 09 July 2008
|
||||
<br>
|
||||
Copyright © 1997-2008 University of Cambridge.
|
||||
<br>
|
||||
Copyright © 1997-2006 University of Cambridge.
|
||||
<p>
|
||||
Return to the <a href="index.html">PCRE index page</a>.
|
||||
</p>
|
||||
|
||||
@@ -0,0 +1,473 @@
|
||||
<html>
|
||||
<head>
|
||||
<title>pcresyntax specification</title>
|
||||
</head>
|
||||
<body bgcolor="#FFFFFF" text="#00005A" link="#0066FF" alink="#3399FF" vlink="#2222BB">
|
||||
<h1>pcresyntax man page</h1>
|
||||
<p>
|
||||
Return to the <a href="index.html">PCRE index page</a>.
|
||||
</p>
|
||||
<p>
|
||||
This page is part of the PCRE HTML documentation. It was generated automatically
|
||||
from the original man page. If there is any nonsense in it, please consult the
|
||||
man page, in case the conversion went wrong.
|
||||
<br>
|
||||
<ul>
|
||||
<li><a name="TOC1" href="#SEC1">PCRE REGULAR EXPRESSION SYNTAX SUMMARY</a>
|
||||
<li><a name="TOC2" href="#SEC2">QUOTING</a>
|
||||
<li><a name="TOC3" href="#SEC3">CHARACTERS</a>
|
||||
<li><a name="TOC4" href="#SEC4">CHARACTER TYPES</a>
|
||||
<li><a name="TOC5" href="#SEC5">GENERAL CATEGORY PROPERTY CODES FOR \p and \P</a>
|
||||
<li><a name="TOC6" href="#SEC6">SCRIPT NAMES FOR \p AND \P</a>
|
||||
<li><a name="TOC7" href="#SEC7">CHARACTER CLASSES</a>
|
||||
<li><a name="TOC8" href="#SEC8">QUANTIFIERS</a>
|
||||
<li><a name="TOC9" href="#SEC9">ANCHORS AND SIMPLE ASSERTIONS</a>
|
||||
<li><a name="TOC10" href="#SEC10">MATCH POINT RESET</a>
|
||||
<li><a name="TOC11" href="#SEC11">ALTERNATION</a>
|
||||
<li><a name="TOC12" href="#SEC12">CAPTURING</a>
|
||||
<li><a name="TOC13" href="#SEC13">ATOMIC GROUPS</a>
|
||||
<li><a name="TOC14" href="#SEC14">COMMENT</a>
|
||||
<li><a name="TOC15" href="#SEC15">OPTION SETTING</a>
|
||||
<li><a name="TOC16" href="#SEC16">LOOKAHEAD AND LOOKBEHIND ASSERTIONS</a>
|
||||
<li><a name="TOC17" href="#SEC17">BACKREFERENCES</a>
|
||||
<li><a name="TOC18" href="#SEC18">SUBROUTINE REFERENCES (POSSIBLY RECURSIVE)</a>
|
||||
<li><a name="TOC19" href="#SEC19">CONDITIONAL PATTERNS</a>
|
||||
<li><a name="TOC20" href="#SEC20">BACKTRACKING CONTROL</a>
|
||||
<li><a name="TOC21" href="#SEC21">NEWLINE CONVENTIONS</a>
|
||||
<li><a name="TOC22" href="#SEC22">WHAT \R MATCHES</a>
|
||||
<li><a name="TOC23" href="#SEC23">CALLOUTS</a>
|
||||
<li><a name="TOC24" href="#SEC24">SEE ALSO</a>
|
||||
<li><a name="TOC25" href="#SEC25">AUTHOR</a>
|
||||
<li><a name="TOC26" href="#SEC26">REVISION</a>
|
||||
</ul>
|
||||
<br><a name="SEC1" href="#TOC1">PCRE REGULAR EXPRESSION SYNTAX SUMMARY</a><br>
|
||||
<P>
|
||||
The full syntax and semantics of the regular expressions that are supported by
|
||||
PCRE are described in the
|
||||
<a href="pcrepattern.html"><b>pcrepattern</b></a>
|
||||
documentation. This document contains just a quick-reference summary of the
|
||||
syntax.
|
||||
</P>
|
||||
<br><a name="SEC2" href="#TOC1">QUOTING</a><br>
|
||||
<P>
|
||||
<pre>
|
||||
\x where x is non-alphanumeric is a literal x
|
||||
\Q...\E treat enclosed characters as literal
|
||||
</PRE>
|
||||
</P>
|
||||
<br><a name="SEC3" href="#TOC1">CHARACTERS</a><br>
|
||||
<P>
|
||||
<pre>
|
||||
\a alarm, that is, the BEL character (hex 07)
|
||||
\cx "control-x", where x is any character
|
||||
\e escape (hex 1B)
|
||||
\f formfeed (hex 0C)
|
||||
\n newline (hex 0A)
|
||||
\r carriage return (hex 0D)
|
||||
\t tab (hex 09)
|
||||
\ddd character with octal code ddd, or backreference
|
||||
\xhh character with hex code hh
|
||||
\x{hhh..} character with hex code hhh..
|
||||
</PRE>
|
||||
</P>
|
||||
<br><a name="SEC4" href="#TOC1">CHARACTER TYPES</a><br>
|
||||
<P>
|
||||
<pre>
|
||||
. any character except newline;
|
||||
in dotall mode, any character whatsoever
|
||||
\C one byte, even in UTF-8 mode (best avoided)
|
||||
\d a decimal digit
|
||||
\D a character that is not a decimal digit
|
||||
\h a horizontal whitespace character
|
||||
\H a character that is not a horizontal whitespace character
|
||||
\p{<i>xx</i>} a character with the <i>xx</i> property
|
||||
\P{<i>xx</i>} a character without the <i>xx</i> property
|
||||
\R a newline sequence
|
||||
\s a whitespace character
|
||||
\S a character that is not a whitespace character
|
||||
\v a vertical whitespace character
|
||||
\V a character that is not a vertical whitespace character
|
||||
\w a "word" character
|
||||
\W a "non-word" character
|
||||
\X an extended Unicode sequence
|
||||
</pre>
|
||||
In PCRE, \d, \D, \s, \S, \w, and \W recognize only ASCII characters.
|
||||
</P>
|
||||
<br><a name="SEC5" href="#TOC1">GENERAL CATEGORY PROPERTY CODES FOR \p and \P</a><br>
|
||||
<P>
|
||||
<pre>
|
||||
C Other
|
||||
Cc Control
|
||||
Cf Format
|
||||
Cn Unassigned
|
||||
Co Private use
|
||||
Cs Surrogate
|
||||
|
||||
L Letter
|
||||
Ll Lower case letter
|
||||
Lm Modifier letter
|
||||
Lo Other letter
|
||||
Lt Title case letter
|
||||
Lu Upper case letter
|
||||
L& Ll, Lu, or Lt
|
||||
|
||||
M Mark
|
||||
Mc Spacing mark
|
||||
Me Enclosing mark
|
||||
Mn Non-spacing mark
|
||||
|
||||
N Number
|
||||
Nd Decimal number
|
||||
Nl Letter number
|
||||
No Other number
|
||||
|
||||
P Punctuation
|
||||
Pc Connector punctuation
|
||||
Pd Dash punctuation
|
||||
Pe Close punctuation
|
||||
Pf Final punctuation
|
||||
Pi Initial punctuation
|
||||
Po Other punctuation
|
||||
Ps Open punctuation
|
||||
|
||||
S Symbol
|
||||
Sc Currency symbol
|
||||
Sk Modifier symbol
|
||||
Sm Mathematical symbol
|
||||
So Other symbol
|
||||
|
||||
Z Separator
|
||||
Zl Line separator
|
||||
Zp Paragraph separator
|
||||
Zs Space separator
|
||||
</PRE>
|
||||
</P>
|
||||
<br><a name="SEC6" href="#TOC1">SCRIPT NAMES FOR \p AND \P</a><br>
|
||||
<P>
|
||||
Arabic,
|
||||
Armenian,
|
||||
Balinese,
|
||||
Bengali,
|
||||
Bopomofo,
|
||||
Braille,
|
||||
Buginese,
|
||||
Buhid,
|
||||
Canadian_Aboriginal,
|
||||
Carian,
|
||||
Cham,
|
||||
Cherokee,
|
||||
Common,
|
||||
Coptic,
|
||||
Cuneiform,
|
||||
Cypriot,
|
||||
Cyrillic,
|
||||
Deseret,
|
||||
Devanagari,
|
||||
Ethiopic,
|
||||
Georgian,
|
||||
Glagolitic,
|
||||
Gothic,
|
||||
Greek,
|
||||
Gujarati,
|
||||
Gurmukhi,
|
||||
Han,
|
||||
Hangul,
|
||||
Hanunoo,
|
||||
Hebrew,
|
||||
Hiragana,
|
||||
Inherited,
|
||||
Kannada,
|
||||
Katakana,
|
||||
Kayah_Li,
|
||||
Kharoshthi,
|
||||
Khmer,
|
||||
Lao,
|
||||
Latin,
|
||||
Lepcha,
|
||||
Limbu,
|
||||
Linear_B,
|
||||
Lycian,
|
||||
Lydian,
|
||||
Malayalam,
|
||||
Mongolian,
|
||||
Myanmar,
|
||||
New_Tai_Lue,
|
||||
Nko,
|
||||
Ogham,
|
||||
Old_Italic,
|
||||
Old_Persian,
|
||||
Ol_Chiki,
|
||||
Oriya,
|
||||
Osmanya,
|
||||
Phags_Pa,
|
||||
Phoenician,
|
||||
Rejang,
|
||||
Runic,
|
||||
Saurashtra,
|
||||
Shavian,
|
||||
Sinhala,
|
||||
Sundanese,
|
||||
Syloti_Nagri,
|
||||
Syriac,
|
||||
Tagalog,
|
||||
Tagbanwa,
|
||||
Tai_Le,
|
||||
Tamil,
|
||||
Telugu,
|
||||
Thaana,
|
||||
Thai,
|
||||
Tibetan,
|
||||
Tifinagh,
|
||||
Ugaritic,
|
||||
Vai,
|
||||
Yi.
|
||||
</P>
|
||||
<br><a name="SEC7" href="#TOC1">CHARACTER CLASSES</a><br>
|
||||
<P>
|
||||
<pre>
|
||||
[...] positive character class
|
||||
[^...] negative character class
|
||||
[x-y] range (can be used for hex characters)
|
||||
[[:xxx:]] positive POSIX named set
|
||||
[[:^xxx:]] negative POSIX named set
|
||||
|
||||
alnum alphanumeric
|
||||
alpha alphabetic
|
||||
ascii 0-127
|
||||
blank space or tab
|
||||
cntrl control character
|
||||
digit decimal digit
|
||||
graph printing, excluding space
|
||||
lower lower case letter
|
||||
print printing, including space
|
||||
punct printing, excluding alphanumeric
|
||||
space whitespace
|
||||
upper upper case letter
|
||||
word same as \w
|
||||
xdigit hexadecimal digit
|
||||
</pre>
|
||||
In PCRE, POSIX character set names recognize only ASCII characters. You can use
|
||||
\Q...\E inside a character class.
|
||||
</P>
|
||||
<br><a name="SEC8" href="#TOC1">QUANTIFIERS</a><br>
|
||||
<P>
|
||||
<pre>
|
||||
? 0 or 1, greedy
|
||||
?+ 0 or 1, possessive
|
||||
?? 0 or 1, lazy
|
||||
* 0 or more, greedy
|
||||
*+ 0 or more, possessive
|
||||
*? 0 or more, lazy
|
||||
+ 1 or more, greedy
|
||||
++ 1 or more, possessive
|
||||
+? 1 or more, lazy
|
||||
{n} exactly n
|
||||
{n,m} at least n, no more than m, greedy
|
||||
{n,m}+ at least n, no more than m, possessive
|
||||
{n,m}? at least n, no more than m, lazy
|
||||
{n,} n or more, greedy
|
||||
{n,}+ n or more, possessive
|
||||
{n,}? n or more, lazy
|
||||
</PRE>
|
||||
</P>
|
||||
<br><a name="SEC9" href="#TOC1">ANCHORS AND SIMPLE ASSERTIONS</a><br>
|
||||
<P>
|
||||
<pre>
|
||||
\b word boundary (only ASCII letters recognized)
|
||||
\B not a word boundary
|
||||
^ start of subject
|
||||
also after internal newline in multiline mode
|
||||
\A start of subject
|
||||
$ end of subject
|
||||
also before newline at end of subject
|
||||
also before internal newline in multiline mode
|
||||
\Z end of subject
|
||||
also before newline at end of subject
|
||||
\z end of subject
|
||||
\G first matching position in subject
|
||||
</PRE>
|
||||
</P>
|
||||
<br><a name="SEC10" href="#TOC1">MATCH POINT RESET</a><br>
|
||||
<P>
|
||||
<pre>
|
||||
\K reset start of match
|
||||
</PRE>
|
||||
</P>
|
||||
<br><a name="SEC11" href="#TOC1">ALTERNATION</a><br>
|
||||
<P>
|
||||
<pre>
|
||||
expr|expr|expr...
|
||||
</PRE>
|
||||
</P>
|
||||
<br><a name="SEC12" href="#TOC1">CAPTURING</a><br>
|
||||
<P>
|
||||
<pre>
|
||||
(...) capturing group
|
||||
(?<name>...) named capturing group (Perl)
|
||||
(?'name'...) named capturing group (Perl)
|
||||
(?P<name>...) named capturing group (Python)
|
||||
(?:...) non-capturing group
|
||||
(?|...) non-capturing group; reset group numbers for
|
||||
capturing groups in each alternative
|
||||
</PRE>
|
||||
</P>
|
||||
<br><a name="SEC13" href="#TOC1">ATOMIC GROUPS</a><br>
|
||||
<P>
|
||||
<pre>
|
||||
(?>...) atomic, non-capturing group
|
||||
</PRE>
|
||||
</P>
|
||||
<br><a name="SEC14" href="#TOC1">COMMENT</a><br>
|
||||
<P>
|
||||
<pre>
|
||||
(?#....) comment (not nestable)
|
||||
</PRE>
|
||||
</P>
|
||||
<br><a name="SEC15" href="#TOC1">OPTION SETTING</a><br>
|
||||
<P>
|
||||
<pre>
|
||||
(?i) caseless
|
||||
(?J) allow duplicate names
|
||||
(?m) multiline
|
||||
(?s) single line (dotall)
|
||||
(?U) default ungreedy (lazy)
|
||||
(?x) extended (ignore white space)
|
||||
(?-...) unset option(s)
|
||||
</pre>
|
||||
The following is recognized only at the start of a pattern or after one of the
|
||||
newline-setting options with similar syntax:
|
||||
<pre>
|
||||
(*UTF8) set UTF-8 mode
|
||||
</PRE>
|
||||
</P>
|
||||
<br><a name="SEC16" href="#TOC1">LOOKAHEAD AND LOOKBEHIND ASSERTIONS</a><br>
|
||||
<P>
|
||||
<pre>
|
||||
(?=...) positive look ahead
|
||||
(?!...) negative look ahead
|
||||
(?<=...) positive look behind
|
||||
(?<!...) negative look behind
|
||||
</pre>
|
||||
Each top-level branch of a look behind must be of a fixed length.
|
||||
</P>
|
||||
<br><a name="SEC17" href="#TOC1">BACKREFERENCES</a><br>
|
||||
<P>
|
||||
<pre>
|
||||
\n reference by number (can be ambiguous)
|
||||
\gn reference by number
|
||||
\g{n} reference by number
|
||||
\g{-n} relative reference by number
|
||||
\k<name> reference by name (Perl)
|
||||
\k'name' reference by name (Perl)
|
||||
\g{name} reference by name (Perl)
|
||||
\k{name} reference by name (.NET)
|
||||
(?P=name) reference by name (Python)
|
||||
</PRE>
|
||||
</P>
|
||||
<br><a name="SEC18" href="#TOC1">SUBROUTINE REFERENCES (POSSIBLY RECURSIVE)</a><br>
|
||||
<P>
|
||||
<pre>
|
||||
(?R) recurse whole pattern
|
||||
(?n) call subpattern by absolute number
|
||||
(?+n) call subpattern by relative number
|
||||
(?-n) call subpattern by relative number
|
||||
(?&name) call subpattern by name (Perl)
|
||||
(?P>name) call subpattern by name (Python)
|
||||
\g<name> call subpattern by name (Oniguruma)
|
||||
\g'name' call subpattern by name (Oniguruma)
|
||||
\g<n> call subpattern by absolute number (Oniguruma)
|
||||
\g'n' call subpattern by absolute number (Oniguruma)
|
||||
\g<+n> call subpattern by relative number (PCRE extension)
|
||||
\g'+n' call subpattern by relative number (PCRE extension)
|
||||
\g<-n> call subpattern by relative number (PCRE extension)
|
||||
\g'-n' call subpattern by relative number (PCRE extension)
|
||||
</PRE>
|
||||
</P>
|
||||
<br><a name="SEC19" href="#TOC1">CONDITIONAL PATTERNS</a><br>
|
||||
<P>
|
||||
<pre>
|
||||
(?(condition)yes-pattern)
|
||||
(?(condition)yes-pattern|no-pattern)
|
||||
|
||||
(?(n)... absolute reference condition
|
||||
(?(+n)... relative reference condition
|
||||
(?(-n)... relative reference condition
|
||||
(?(<name>)... named reference condition (Perl)
|
||||
(?('name')... named reference condition (Perl)
|
||||
(?(name)... named reference condition (PCRE)
|
||||
(?(R)... overall recursion condition
|
||||
(?(Rn)... specific group recursion condition
|
||||
(?(R&name)... specific recursion condition
|
||||
(?(DEFINE)... define subpattern for reference
|
||||
(?(assert)... assertion condition
|
||||
</PRE>
|
||||
</P>
|
||||
<br><a name="SEC20" href="#TOC1">BACKTRACKING CONTROL</a><br>
|
||||
<P>
|
||||
The following act immediately they are reached:
|
||||
<pre>
|
||||
(*ACCEPT) force successful match
|
||||
(*FAIL) force backtrack; synonym (*F)
|
||||
</pre>
|
||||
The following act only when a subsequent match failure causes a backtrack to
|
||||
reach them. They all force a match failure, but they differ in what happens
|
||||
afterwards. Those that advance the start-of-match point do so only if the
|
||||
pattern is not anchored.
|
||||
<pre>
|
||||
(*COMMIT) overall failure, no advance of starting point
|
||||
(*PRUNE) advance to next starting character
|
||||
(*SKIP) advance start to current matching position
|
||||
(*THEN) local failure, backtrack to next alternation
|
||||
</PRE>
|
||||
</P>
|
||||
<br><a name="SEC21" href="#TOC1">NEWLINE CONVENTIONS</a><br>
|
||||
<P>
|
||||
These are recognized only at the very start of the pattern or after a
|
||||
(*BSR_...) or (*UTF8) option.
|
||||
<pre>
|
||||
(*CR) carriage return only
|
||||
(*LF) linefeed only
|
||||
(*CRLF) carriage return followed by linefeed
|
||||
(*ANYCRLF) all three of the above
|
||||
(*ANY) any Unicode newline sequence
|
||||
</PRE>
|
||||
</P>
|
||||
<br><a name="SEC22" href="#TOC1">WHAT \R MATCHES</a><br>
|
||||
<P>
|
||||
These are recognized only at the very start of the pattern or after a
|
||||
(*...) option that sets the newline convention or UTF-8 mode.
|
||||
<pre>
|
||||
(*BSR_ANYCRLF) CR, LF, or CRLF
|
||||
(*BSR_UNICODE) any Unicode newline sequence
|
||||
</PRE>
|
||||
</P>
|
||||
<br><a name="SEC23" href="#TOC1">CALLOUTS</a><br>
|
||||
<P>
|
||||
<pre>
|
||||
(?C) callout
|
||||
(?Cn) callout with data n
|
||||
</PRE>
|
||||
</P>
|
||||
<br><a name="SEC24" href="#TOC1">SEE ALSO</a><br>
|
||||
<P>
|
||||
<b>pcrepattern</b>(3), <b>pcreapi</b>(3), <b>pcrecallout</b>(3),
|
||||
<b>pcrematching</b>(3), <b>pcre</b>(3).
|
||||
</P>
|
||||
<br><a name="SEC25" href="#TOC1">AUTHOR</a><br>
|
||||
<P>
|
||||
Philip Hazel
|
||||
<br>
|
||||
University Computing Service
|
||||
<br>
|
||||
Cambridge CB2 3QH, England.
|
||||
<br>
|
||||
</P>
|
||||
<br><a name="SEC26" href="#TOC1">REVISION</a><br>
|
||||
<P>
|
||||
Last updated: 11 April 2009
|
||||
<br>
|
||||
Copyright © 1997-2009 University of Cambridge.
|
||||
<br>
|
||||
<p>
|
||||
Return to the <a href="index.html">PCRE index page</a>.
|
||||
</p>
|
||||
@@ -23,8 +23,11 @@ man page, in case the conversion went wrong.
|
||||
<li><a name="TOC8" href="#SEC8">OUTPUT FROM THE ALTERNATIVE MATCHING FUNCTION</a>
|
||||
<li><a name="TOC9" href="#SEC9">RESTARTING AFTER A PARTIAL MATCH</a>
|
||||
<li><a name="TOC10" href="#SEC10">CALLOUTS</a>
|
||||
<li><a name="TOC11" href="#SEC11">SAVING AND RELOADING COMPILED PATTERNS</a>
|
||||
<li><a name="TOC12" href="#SEC12">AUTHOR</a>
|
||||
<li><a name="TOC11" href="#SEC11">NON-PRINTING CHARACTERS</a>
|
||||
<li><a name="TOC12" href="#SEC12">SAVING AND RELOADING COMPILED PATTERNS</a>
|
||||
<li><a name="TOC13" href="#SEC13">SEE ALSO</a>
|
||||
<li><a name="TOC14" href="#SEC14">AUTHOR</a>
|
||||
<li><a name="TOC15" href="#SEC15">REVISION</a>
|
||||
</ul>
|
||||
<br><a name="SEC1" href="#TOC1">SYNOPSIS</a><br>
|
||||
<P>
|
||||
@@ -43,6 +46,11 @@ documentation.
|
||||
</P>
|
||||
<br><a name="SEC2" href="#TOC1">OPTIONS</a><br>
|
||||
<P>
|
||||
<b>-b</b>
|
||||
Behave as if each regex has the <b>/B</b> (show bytecode) modifier; the internal
|
||||
form is output after compilation.
|
||||
</P>
|
||||
<P>
|
||||
<b>-C</b>
|
||||
Output the version number of the PCRE library, and all available information
|
||||
about the optional features that are included, and then exit.
|
||||
@@ -50,7 +58,8 @@ about the optional features that are included, and then exit.
|
||||
<P>
|
||||
<b>-d</b>
|
||||
Behave as if each regex has the <b>/D</b> (debug) modifier; the internal
|
||||
form is output after compilation.
|
||||
form and information about the compiled pattern are output after compilation;
|
||||
<b>-d</b> is equivalent to <b>-b -i</b>.
|
||||
</P>
|
||||
<P>
|
||||
<b>-dfa</b>
|
||||
@@ -59,11 +68,21 @@ alternative matching function, <b>pcre_dfa_exec()</b>, to be used instead of the
|
||||
standard <b>pcre_exec()</b> function (more detail is given below).
|
||||
</P>
|
||||
<P>
|
||||
<b>-help</b>
|
||||
Output a brief summary of these options and then exit.
|
||||
</P>
|
||||
<P>
|
||||
<b>-i</b>
|
||||
Behave as if each regex has the <b>/I</b> modifier; information about the
|
||||
compiled pattern is given after compilation.
|
||||
</P>
|
||||
<P>
|
||||
<b>-M</b>
|
||||
Behave as if each data line contains the \M escape sequence; this causes
|
||||
PCRE to discover the minimum MATCH_LIMIT and MATCH_LIMIT_RECURSION settings by
|
||||
calling <b>pcre_exec()</b> repeatedly with different limits.
|
||||
</P>
|
||||
<P>
|
||||
<b>-m</b>
|
||||
Output the size of each compiled pattern after it has been compiled. This is
|
||||
equivalent to adding <b>/M</b> to each regular expression. For compatibility
|
||||
@@ -72,9 +91,11 @@ with earlier versions of pcretest, <b>-s</b> is a synonym for <b>-m</b>.
|
||||
<P>
|
||||
<b>-o</b> <i>osize</i>
|
||||
Set the number of elements in the output vector that is used when calling
|
||||
<b>pcre_exec()</b> to be <i>osize</i>. The default value is 45, which is enough
|
||||
for 14 capturing subexpressions. The vector size can be changed for individual
|
||||
matching calls by including \O in the data line (see below).
|
||||
<b>pcre_exec()</b> or <b>pcre_dfa_exec()</b> to be <i>osize</i>. The default value
|
||||
is 45, which is enough for 14 capturing subexpressions for <b>pcre_exec()</b> or
|
||||
22 different matches for <b>pcre_dfa_exec()</b>. The vector size can be
|
||||
changed for individual matching calls by including \O in the data line (see
|
||||
below).
|
||||
</P>
|
||||
<P>
|
||||
<b>-p</b>
|
||||
@@ -96,7 +117,15 @@ megabytes.
|
||||
Run each compile, study, and match many times with a timer, and output
|
||||
resulting time per compile or match (in milliseconds). Do not set <b>-m</b> with
|
||||
<b>-t</b>, because you will then get the size output a zillion times, and the
|
||||
timing will be distorted.
|
||||
timing will be distorted. You can control the number of iterations that are
|
||||
used for timing by following <b>-t</b> with a number (as a separate item on the
|
||||
command line). For example, "-t 1000" would iterate 1000 times. The default is
|
||||
to iterate 500000 times.
|
||||
</P>
|
||||
<P>
|
||||
<b>-tm</b>
|
||||
This is like <b>-t</b> except that it times only the matching phase, not the
|
||||
compile or study phases.
|
||||
</P>
|
||||
<br><a name="SEC3" href="#TOC1">DESCRIPTION</a><br>
|
||||
<P>
|
||||
@@ -107,6 +136,13 @@ stdout, and prompts for each line of input, using "re>" to prompt for regula
|
||||
expressions, and "data>" to prompt for data lines.
|
||||
</P>
|
||||
<P>
|
||||
When <b>pcretest</b> is built, a configuration option can specify that it should
|
||||
be linked with the <b>libreadline</b> library. When this is done, if the input
|
||||
is from a terminal, it is read using the <b>readline()</b> function. This
|
||||
provides line-editing and history facilities. The output from the <b>-help</b>
|
||||
option states whether or not <b>readline()</b> will be used.
|
||||
</P>
|
||||
<P>
|
||||
The program handles any number of sets of input on a single input file. Each
|
||||
set starts with a regular expression, and continues with any number of data
|
||||
lines to be matched against the pattern.
|
||||
@@ -114,8 +150,8 @@ lines to be matched against the pattern.
|
||||
<P>
|
||||
Each data line is matched separately and independently. If you want to do
|
||||
multi-line matches, you have to use the \n escape sequence (or \r or \r\n,
|
||||
depending on the newline setting) in a single line of input to encode the
|
||||
newline characters. There is no limit on the length of data lines; the input
|
||||
etc., depending on the newline setting) in a single line of input to encode the
|
||||
newline sequences. There is no limit on the length of data lines; the input
|
||||
buffer is automatically extended if it is too small.
|
||||
</P>
|
||||
<P>
|
||||
@@ -168,20 +204,30 @@ effect as they do in Perl. For example:
|
||||
The following table shows additional modifiers for setting PCRE options that do
|
||||
not correspond to anything in Perl:
|
||||
<pre>
|
||||
<b>/A</b> PCRE_ANCHORED
|
||||
<b>/C</b> PCRE_AUTO_CALLOUT
|
||||
<b>/E</b> PCRE_DOLLAR_ENDONLY
|
||||
<b>/f</b> PCRE_FIRSTLINE
|
||||
<b>/J</b> PCRE_DUPNAMES
|
||||
<b>/N</b> PCRE_NO_AUTO_CAPTURE
|
||||
<b>/U</b> PCRE_UNGREEDY
|
||||
<b>/X</b> PCRE_EXTRA
|
||||
<b>/<cr></b> PCRE_NEWLINE_CR
|
||||
<b>/<lf></b> PCRE_NEWLINE_LF
|
||||
<b>/<crlf></b> PCRE_NEWLINE_CRLF
|
||||
<b>/A</b> PCRE_ANCHORED
|
||||
<b>/C</b> PCRE_AUTO_CALLOUT
|
||||
<b>/E</b> PCRE_DOLLAR_ENDONLY
|
||||
<b>/f</b> PCRE_FIRSTLINE
|
||||
<b>/J</b> PCRE_DUPNAMES
|
||||
<b>/N</b> PCRE_NO_AUTO_CAPTURE
|
||||
<b>/U</b> PCRE_UNGREEDY
|
||||
<b>/X</b> PCRE_EXTRA
|
||||
<b>/<JS></b> PCRE_JAVASCRIPT_COMPAT
|
||||
<b>/<cr></b> PCRE_NEWLINE_CR
|
||||
<b>/<lf></b> PCRE_NEWLINE_LF
|
||||
<b>/<crlf></b> PCRE_NEWLINE_CRLF
|
||||
<b>/<anycrlf></b> PCRE_NEWLINE_ANYCRLF
|
||||
<b>/<any></b> PCRE_NEWLINE_ANY
|
||||
<b>/<bsr_anycrlf></b> PCRE_BSR_ANYCRLF
|
||||
<b>/<bsr_unicode></b> PCRE_BSR_UNICODE
|
||||
</pre>
|
||||
Those specifying line endings are literal strings as shown. Details of the
|
||||
meanings of these PCRE options are given in the
|
||||
Those specifying line ending sequences are literal strings as shown, but the
|
||||
letters can be in either case. This example sets multiline matching with CRLF
|
||||
as the line ending sequence:
|
||||
<pre>
|
||||
/^abc/m<crlf>
|
||||
</pre>
|
||||
Details of the meanings of these PCRE options are given in the
|
||||
<a href="pcreapi.html"><b>pcreapi</b></a>
|
||||
documentation.
|
||||
</P>
|
||||
@@ -220,6 +266,14 @@ the subject string. This is useful for tests where the subject contains
|
||||
multiple copies of the same substring.
|
||||
</P>
|
||||
<P>
|
||||
The <b>/B</b> modifier is a debugging feature. It requests that <b>pcretest</b>
|
||||
output a representation of the compiled byte code after compilation. Normally
|
||||
this information contains length and offset values; however, if <b>/Z</b> is
|
||||
also present, this data is replaced by spaces. This is a special feature for
|
||||
use in the automatic test scripts; it ensures that the same output is generated
|
||||
for different internal link sizes.
|
||||
</P>
|
||||
<P>
|
||||
The <b>/L</b> modifier must be followed directly by the name of a locale, for
|
||||
example,
|
||||
<pre>
|
||||
@@ -238,10 +292,8 @@ so on). It does this by calling <b>pcre_fullinfo()</b> after compiling a
|
||||
pattern. If the pattern is studied, the results of that are also output.
|
||||
</P>
|
||||
<P>
|
||||
The <b>/D</b> modifier is a PCRE debugging feature, which also assumes <b>/I</b>.
|
||||
It causes the internal form of compiled regular expressions to be output after
|
||||
compilation. If the pattern was studied, the information returned is also
|
||||
output.
|
||||
The <b>/D</b> modifier is a PCRE debugging feature, and is equivalent to
|
||||
<b>/BI</b>, that is, both the <b>/B</b> and the <b>/I</b> modifiers.
|
||||
</P>
|
||||
<P>
|
||||
The <b>/F</b> modifier causes <b>pcretest</b> to flip the byte order of the
|
||||
@@ -289,15 +341,15 @@ complicated features of PCRE. If you are just testing "ordinary" regular
|
||||
expressions, you probably don't need any of these. The following escapes are
|
||||
recognized:
|
||||
<pre>
|
||||
\a alarm (= BEL)
|
||||
\b backspace
|
||||
\e escape
|
||||
\f formfeed
|
||||
\n newline
|
||||
\a alarm (BEL, \x07)
|
||||
\b backspace (\x08)
|
||||
 \e escape (\x1b)
|
||||
\f formfeed (\x0c)
|
||||
\n newline (\x0a)
|
||||
\qdd set the PCRE_MATCH_LIMIT limit to dd (any number of digits)
|
||||
\r carriage return
|
||||
\t tab
|
||||
\v vertical tab
|
||||
\r carriage return (\x0d)
|
||||
\t tab (\x09)
|
||||
\v vertical tab (\x0b)
|
||||
\nnn octal character (up to 3 octal digits)
|
||||
\xhh hexadecimal character (up to 2 hex digits)
|
||||
\x{hh...} hexadecimal character, any number of digits in UTF-8 mode
|
||||
@@ -331,11 +383,17 @@ recognized:
|
||||
\<cr> pass the PCRE_NEWLINE_CR option to <b>pcre_exec()</b> or <b>pcre_dfa_exec()</b>
|
||||
\<lf> pass the PCRE_NEWLINE_LF option to <b>pcre_exec()</b> or <b>pcre_dfa_exec()</b>
|
||||
\<crlf> pass the PCRE_NEWLINE_CRLF option to <b>pcre_exec()</b> or <b>pcre_dfa_exec()</b>
|
||||
\<anycrlf> pass the PCRE_NEWLINE_ANYCRLF option to <b>pcre_exec()</b> or <b>pcre_dfa_exec()</b>
|
||||
\<any> pass the PCRE_NEWLINE_ANY option to <b>pcre_exec()</b> or <b>pcre_dfa_exec()</b>
|
||||
</pre>
|
||||
The escapes that specify line endings are literal strings, exactly as shown.
|
||||
A backslash followed by anything else just escapes the anything else. If the
|
||||
very last character is a backslash, it is ignored. This gives a way of passing
|
||||
an empty line as data, since a real empty line terminates the data input.
|
||||
The escapes that specify line ending sequences are literal strings, exactly as
|
||||
shown. No more than one newline setting should be present in any data line.
|
||||
</P>
|
||||
<P>
|
||||
A backslash followed by anything else just escapes the anything else. If
|
||||
the very last character is a backslash, it is ignored. This gives a way of
|
||||
passing an empty line as data, since a real empty line terminates the data
|
||||
input.
|
||||
</P>
|
||||
<P>
|
||||
If \M is present, <b>pcretest</b> calls <b>pcre_exec()</b> several times, with
|
||||
@@ -365,7 +423,10 @@ and \Z, causing REG_NOTBOL and REG_NOTEOL, respectively, to be passed to
|
||||
The use of \x{hh...} to represent UTF-8 characters is not dependent on the use
|
||||
of the <b>/8</b> modifier on the pattern. It is recognized always. There may be
|
||||
any number of hexadecimal digits inside the braces. The result is from one to
|
||||
six bytes, encoded according to the UTF-8 rules.
|
||||
six bytes, encoded according to the original UTF-8 rules of RFC 2279. This
|
||||
allows for values in the range 0 to 0x7FFFFFFF. Note that not all of those are
|
||||
valid Unicode code points, or indeed valid UTF-8 characters according to the
|
||||
later rules in RFC 3629.
|
||||
</P>
|
||||
<br><a name="SEC6" href="#TOC1">THE ALTERNATIVE MATCHING FUNCTION</a><br>
|
||||
<P>
|
||||
@@ -398,7 +459,7 @@ respectively, and otherwise the PCRE negative error number. Here is an example
|
||||
of an interactive <b>pcretest</b> run.
|
||||
<pre>
|
||||
$ pcretest
|
||||
PCRE version 5.00 07-Sep-2004
|
||||
PCRE version 7.0 30-Nov-2006
|
||||
|
||||
re> /^abc(\d+)/
|
||||
data> abc123
|
||||
@@ -407,11 +468,26 @@ of an interactive <b>pcretest</b> run.
|
||||
data> xyz
|
||||
No match
|
||||
</pre>
|
||||
Note that unset capturing substrings that are not followed by one that is set
|
||||
are not returned by <b>pcre_exec()</b>, and are not shown by <b>pcretest</b>. In
|
||||
the following example, there are two capturing substrings, but when the first
|
||||
data line is matched, the second, unset substring is not shown. An "internal"
|
||||
unset substring is shown as "<unset>", as for the second data line.
|
||||
<pre>
|
||||
re> /(a)|(b)/
|
||||
data> a
|
||||
0: a
|
||||
1: a
|
||||
data> b
|
||||
0: b
|
||||
1: <unset>
|
||||
2: b
|
||||
</pre>
|
||||
If the strings contain any non-printing characters, they are output as \0x
|
||||
escapes, or as \x{...} escapes if the <b>/8</b> modifier was present on the
|
||||
pattern. If the pattern has the <b>/+</b> modifier, the output for substring 0
|
||||
is followed by the rest of the subject string, identified by "0+" like
|
||||
this:
|
||||
pattern. See below for the definition of non-printing characters. If the
|
||||
pattern has the <b>/+</b> modifier, the output for substring 0 is followed by
|
||||
the rest of the subject string, identified by "0+" like this:
|
||||
<pre>
|
||||
re> /cat/+
|
||||
data> cataract
|
||||
@@ -441,10 +517,10 @@ length (that is, the return from the extraction function) is given in
|
||||
parentheses after each string for <b>\C</b> and <b>\G</b>.
|
||||
</P>
|
||||
<P>
|
||||
Note that while patterns can be continued over several lines (a plain ">"
|
||||
Note that whereas patterns can be continued over several lines (a plain ">"
|
||||
prompt is used for continuations), data lines may not. However newlines can be
|
||||
included in data by means of the \n escape (or \r or \r\n for those newline
|
||||
settings).
|
||||
included in data by means of the \n escape (or \r, \r\n, etc., depending on
|
||||
the newline sequence setting).
|
||||
</P>
|
||||
<br><a name="SEC8" href="#TOC1">OUTPUT FROM THE ALTERNATIVE MATCHING FUNCTION</a><br>
|
||||
<P>
|
||||
@@ -463,7 +539,7 @@ the subject where there is at least one match. For example:
|
||||
longest matching string is always given first (and numbered zero).
|
||||
</P>
|
||||
<P>
|
||||
If \fB/g\P is present on the pattern, the search for further matches resumes
|
||||
If <b>/g</b> is present on the pattern, the search for further matches resumes
|
||||
at the end of the longest match. For example:
|
||||
<pre>
|
||||
re> /(tang|tangerine|tan)/g
|
||||
@@ -537,7 +613,19 @@ the
|
||||
<a href="pcrecallout.html"><b>pcrecallout</b></a>
|
||||
documentation.
|
||||
</P>
|
||||
<br><a name="SEC11" href="#TOC1">SAVING AND RELOADING COMPILED PATTERNS</a><br>
|
||||
<br><a name="SEC11" href="#TOC1">NON-PRINTING CHARACTERS</a><br>
|
||||
<P>
|
||||
When <b>pcretest</b> is outputting text in the compiled version of a pattern,
|
||||
bytes other than 32-126 are always treated as non-printing characters and are
|
||||
therefore shown as hex escapes.
|
||||
</P>
|
||||
<P>
|
||||
When <b>pcretest</b> is outputting text that is a matched part of a subject
|
||||
string, it behaves in the same way, unless a different locale has been set for
|
||||
the pattern (using the <b>/L</b> modifier). In this case, the <b>isprint()</b>
|
||||
function is used to distinguish printing and non-printing characters.
|
||||
</P>
|
||||
<br><a name="SEC12" href="#TOC1">SAVING AND RELOADING COMPILED PATTERNS</a><br>
|
||||
<P>
|
||||
The facilities described in this section are not available when the POSIX
|
||||
interface to PCRE is being used, that is, when the <b>/P</b> pattern modifier is
|
||||
@@ -599,18 +687,26 @@ string using a reloaded pattern is likely to cause <b>pcretest</b> to crash.
|
||||
Finally, if you attempt to load a file that is not in the correct format, the
|
||||
result is undefined.
|
||||
</P>
|
||||
<br><a name="SEC12" href="#TOC1">AUTHOR</a><br>
|
||||
<br><a name="SEC13" href="#TOC1">SEE ALSO</a><br>
|
||||
<P>
|
||||
<b>pcre</b>(3), <b>pcreapi</b>(3), <b>pcrecallout</b>(3), <b>pcrematching</b>(3),
|
||||
<b>pcrepartial</b>(3), <b>pcrepattern</b>(3), <b>pcreprecompile</b>(3).
|
||||
</P>
|
||||
<br><a name="SEC14" href="#TOC1">AUTHOR</a><br>
|
||||
<P>
|
||||
Philip Hazel
|
||||
<br>
|
||||
University Computing Service,
|
||||
University Computing Service
|
||||
<br>
|
||||
Cambridge CB2 3QH, England.
|
||||
<br>
|
||||
Cambridge CB2 3QG, England.
|
||||
</P>
|
||||
<br><a name="SEC15" href="#TOC1">REVISION</a><br>
|
||||
<P>
|
||||
Last updated: 29 June 2006
|
||||
Last updated: 10 March 2009
|
||||
<br>
|
||||
Copyright © 1997-2009 University of Cambridge.
|
||||
<br>
|
||||
Copyright © 1997-2006 University of Cambridge.
|
||||
<p>
|
||||
Return to the <a href="index.html">PCRE index page</a>.
|
||||
</p>
|
||||
|
||||
@@ -0,0 +1,140 @@
|
||||
<html>
|
||||
<!-- This is a manually maintained file that is the root of the HTML version of
|
||||
the PCRE documentation. When the HTML documents are built from the man
|
||||
page versions, the entire doc/html directory is emptied, this file is then
|
||||
copied into doc/html/index.html, and the remaining files therein are
|
||||
created by the 132html script.
|
||||
-->
|
||||
<head>
|
||||
<title>PCRE specification</title>
|
||||
</head>
|
||||
<body bgcolor="#FFFFFF" text="#00005A" link="#0066FF" alink="#3399FF" vlink="#2222BB">
|
||||
<h1>Perl-compatible Regular Expressions (PCRE)</h1>
|
||||
<p>
|
||||
The HTML documentation for PCRE comprises the following pages:
|
||||
</p>
|
||||
|
||||
<table>
|
||||
<tr><td><a href="pcre.html">pcre</a></td>
|
||||
<td> Introductory page</td></tr>
|
||||
|
||||
<tr><td><a href="pcre-config.html">pcre-config</a></td>
|
||||
<td> Information about the installation configuration</td></tr>
|
||||
|
||||
<tr><td><a href="pcreapi.html">pcreapi</a></td>
|
||||
<td> PCRE's native API</td></tr>
|
||||
|
||||
<tr><td><a href="pcrebuild.html">pcrebuild</a></td>
|
||||
<td> Options for building PCRE</td></tr>
|
||||
|
||||
<tr><td><a href="pcrecallout.html">pcrecallout</a></td>
|
||||
<td> The <i>callout</i> facility</td></tr>
|
||||
|
||||
<tr><td><a href="pcrecompat.html">pcrecompat</a></td>
|
||||
 <td> Compatibility with Perl</td></tr>
|
||||
|
||||
<tr><td><a href="pcrecpp.html">pcrecpp</a></td>
|
||||
<td> The C++ wrapper for the PCRE library</td></tr>
|
||||
|
||||
<tr><td><a href="pcregrep.html">pcregrep</a></td>
|
||||
<td> The <b>pcregrep</b> command</td></tr>
|
||||
|
||||
<tr><td><a href="pcrematching.html">pcrematching</a></td>
|
||||
<td> Discussion of the two matching algorithms</td></tr>
|
||||
|
||||
<tr><td><a href="pcrepartial.html">pcrepartial</a></td>
|
||||
<td> Using PCRE for partial matching</td></tr>
|
||||
|
||||
<tr><td><a href="pcrepattern.html">pcrepattern</a></td>
|
||||
<td> Specification of the regular expressions supported by PCRE</td></tr>
|
||||
|
||||
<tr><td><a href="pcreperform.html">pcreperform</a></td>
|
||||
<td> Some comments on performance</td></tr>
|
||||
|
||||
<tr><td><a href="pcreposix.html">pcreposix</a></td>
|
||||
<td> The POSIX API to the PCRE library</td></tr>
|
||||
|
||||
<tr><td><a href="pcreprecompile.html">pcreprecompile</a></td>
|
||||
<td> How to save and re-use compiled patterns</td></tr>
|
||||
|
||||
<tr><td><a href="pcresample.html">pcresample</a></td>
|
||||
<td> Description of the sample program</td></tr>
|
||||
|
||||
<tr><td><a href="pcrestack.html">pcrestack</a></td>
|
||||
<td> Discussion of PCRE's stack usage</td></tr>
|
||||
|
||||
<tr><td><a href="pcresyntax.html">pcresyntax</a></td>
|
||||
<td> Syntax quick-reference summary</td></tr>
|
||||
|
||||
<tr><td><a href="pcretest.html">pcretest</a></td>
|
||||
<td> The <b>pcretest</b> command for testing PCRE</td></tr>
|
||||
</table>
|
||||
|
||||
<p>
|
||||
There are also individual pages that summarize the interface for each function
|
||||
in the library:
|
||||
</p>
|
||||
|
||||
<table>
|
||||
|
||||
<tr><td><a href="pcre_compile.html">pcre_compile</a></td>
|
||||
<td> Compile a regular expression</td></tr>
|
||||
|
||||
<tr><td><a href="pcre_compile2.html">pcre_compile2</a></td>
|
||||
<td> Compile a regular expression (alternate interface)</td></tr>
|
||||
|
||||
<tr><td><a href="pcre_config.html">pcre_config</a></td>
|
||||
<td> Show build-time configuration options</td></tr>
|
||||
|
||||
<tr><td><a href="pcre_copy_named_substring.html">pcre_copy_named_substring</a></td>
|
||||
<td> Extract named substring into given buffer</td></tr>
|
||||
|
||||
<tr><td><a href="pcre_copy_substring.html">pcre_copy_substring</a></td>
|
||||
<td> Extract numbered substring into given buffer</td></tr>
|
||||
|
||||
<tr><td><a href="pcre_dfa_exec.html">pcre_dfa_exec</a></td>
|
||||
<td> Match a compiled pattern to a subject string
|
||||
(DFA algorithm; <i>not</i> Perl compatible)</td></tr>
|
||||
|
||||
<tr><td><a href="pcre_exec.html">pcre_exec</a></td>
|
||||
<td> Match a compiled pattern to a subject string
|
||||
(Perl compatible)</td></tr>
|
||||
|
||||
<tr><td><a href="pcre_free_substring.html">pcre_free_substring</a></td>
|
||||
<td> Free extracted substring</td></tr>
|
||||
|
||||
<tr><td><a href="pcre_free_substring_list.html">pcre_free_substring_list</a></td>
|
||||
<td> Free list of extracted substrings</td></tr>
|
||||
|
||||
<tr><td><a href="pcre_fullinfo.html">pcre_fullinfo</a></td>
|
||||
<td> Extract information about a pattern</td></tr>
|
||||
|
||||
<tr><td><a href="pcre_get_named_substring.html">pcre_get_named_substring</a></td>
|
||||
<td> Extract named substring into new memory</td></tr>
|
||||
|
||||
<tr><td><a href="pcre_get_stringnumber.html">pcre_get_stringnumber</a></td>
|
||||
<td> Convert captured string name to number</td></tr>
|
||||
|
||||
<tr><td><a href="pcre_get_substring.html">pcre_get_substring</a></td>
|
||||
<td> Extract numbered substring into new memory</td></tr>
|
||||
|
||||
<tr><td><a href="pcre_get_substring_list.html">pcre_get_substring_list</a></td>
|
||||
<td> Extract all substrings into new memory</td></tr>
|
||||
|
||||
<tr><td><a href="pcre_info.html">pcre_info</a></td>
|
||||
<td> Obsolete information extraction function</td></tr>
|
||||
|
||||
<tr><td><a href="pcre_maketables.html">pcre_maketables</a></td>
|
||||
<td> Build character tables in current locale</td></tr>
|
||||
|
||||
<tr><td><a href="pcre_refcount.html">pcre_refcount</a></td>
|
||||
<td> Maintain reference count in compiled pattern</td></tr>
|
||||
|
||||
<tr><td><a href="pcre_study.html">pcre_study</a></td>
|
||||
<td> Study a compiled pattern</td></tr>
|
||||
|
||||
<tr><td><a href="pcre_version.html">pcre_version</a></td>
|
||||
<td> Return PCRE version and release date</td></tr>
|
||||
</table>
|
||||
|
||||
</html>
|
||||
@@ -0,0 +1,73 @@
|
||||
.TH PCRE-CONFIG 1
|
||||
.SH NAME
|
||||
pcre-config - program to return PCRE configuration
|
||||
.SH SYNOPSIS
|
||||
.rs
|
||||
.sp
|
||||
.B pcre-config [--prefix] [--exec-prefix] [--version] [--libs]
|
||||
.ti +5n
|
||||
.B [--libs-posix] [--cflags] [--cflags-posix]
|
||||
.
|
||||
.
|
||||
.SH DESCRIPTION
|
||||
.rs
|
||||
.sp
|
||||
\fBpcre-config\fP returns the configuration of the installed PCRE
|
||||
libraries and the options required to compile a program to use them.
|
||||
.
|
||||
.
|
||||
.SH OPTIONS
|
||||
.rs
|
||||
.TP 10
|
||||
\fB--prefix\fP
|
||||
Writes the directory prefix used in the PCRE installation for architecture
|
||||
independent files (\fI/usr\fP on many systems, \fI/usr/local\fP on some
|
||||
systems) to the standard output.
|
||||
.TP 10
|
||||
\fB--exec-prefix\fP
|
||||
Writes the directory prefix used in the PCRE installation for architecture
|
||||
dependent files (normally the same as \fB--prefix\fP) to the standard output.
|
||||
.TP 10
|
||||
\fB--version\fP
|
||||
Writes the version number of the installed PCRE libraries to the standard
|
||||
output.
|
||||
.TP 10
|
||||
\fB--libs\fP
|
||||
Writes to the standard output the command line options required to link
|
||||
with PCRE (\fB-lpcre\fP on many systems).
|
||||
.TP 10
|
||||
\fB--libs-posix\fP
|
||||
Writes to the standard output the command line options required to link with
|
||||
the PCRE posix emulation library (\fB-lpcreposix\fP \fB-lpcre\fP on many
|
||||
systems).
|
||||
.TP 10
|
||||
\fB--cflags\fP
|
||||
Writes to the standard output the command line options required to compile
|
||||
files that use PCRE (this may include some \fB-I\fP options, but is blank on
|
||||
many systems).
|
||||
.TP 10
|
||||
\fB--cflags-posix\fP
|
||||
Writes to the standard output the command line options required to compile
|
||||
files that use the PCRE posix emulation library (this may include some \fB-I\fP
|
||||
options, but is blank on many systems).
|
||||
.
|
||||
.
|
||||
.SH "SEE ALSO"
|
||||
.rs
|
||||
.sp
|
||||
\fBpcre(3)\fP
|
||||
.
|
||||
.
|
||||
.SH AUTHOR
|
||||
.rs
|
||||
.sp
|
||||
This manual page was originally written by Mark Baker for the Debian GNU/Linux
|
||||
system. It has been slightly revised as a generic PCRE man page.
|
||||
.
|
||||
.
|
||||
.SH REVISION
|
||||
.rs
|
||||
.sp
|
||||
.nf
|
||||
Last updated: 18 April 2007
|
||||
.fi
|
||||
@@ -0,0 +1,67 @@
|
||||
PCRE-CONFIG(1) PCRE-CONFIG(1)
|
||||
|
||||
|
||||
|
||||
NAME
|
||||
pcre-config - program to return PCRE configuration
|
||||
|
||||
SYNOPSIS
|
||||
|
||||
pcre-config [--prefix] [--exec-prefix] [--version] [--libs]
|
||||
[--libs-posix] [--cflags] [--cflags-posix]
|
||||
|
||||
|
||||
DESCRIPTION
|
||||
|
||||
pcre-config returns the configuration of the installed PCRE libraries
|
||||
and the options required to compile a program to use them.
|
||||
|
||||
|
||||
OPTIONS
|
||||
|
||||
--prefix Writes the directory prefix used in the PCRE installation for
|
||||
architecture independent files (/usr on many systems,
|
||||
/usr/local on some systems) to the standard output.
|
||||
|
||||
--exec-prefix
|
||||
Writes the directory prefix used in the PCRE installation for
|
||||
architecture dependent files (normally the same as --prefix)
|
||||
to the standard output.
|
||||
|
||||
--version Writes the version number of the installed PCRE libraries to
|
||||
the standard output.
|
||||
|
||||
--libs Writes to the standard output the command line options
|
||||
required to link with PCRE (-lpcre on many systems).
|
||||
|
||||
--libs-posix
|
||||
Writes to the standard output the command line options
|
||||
required to link with the PCRE posix emulation library
|
||||
(-lpcreposix -lpcre on many systems).
|
||||
|
||||
--cflags Writes to the standard output the command line options
|
||||
required to compile files that use PCRE (this may include
|
||||
some -I options, but is blank on many systems).
|
||||
|
||||
--cflags-posix
|
||||
Writes to the standard output the command line options
|
||||
required to compile files that use the PCRE posix emulation
|
||||
library (this may include some -I options, but is blank on
|
||||
many systems).
|
||||
|
||||
|
||||
SEE ALSO
|
||||
|
||||
pcre(3)
|
||||
|
||||
|
||||
AUTHOR
|
||||
|
||||
This manual page was originally written by Mark Baker for the Debian
|
||||
GNU/Linux system. It has been slightly revised as a generic PCRE man
|
||||
page.
|
||||
|
||||
|
||||
REVISION
|
||||
|
||||
Last updated: 18 April 2007
|
||||
+101
-49
@@ -6,12 +6,18 @@ PCRE - Perl-compatible regular expressions
|
||||
.sp
|
||||
The PCRE library is a set of functions that implement regular expression
|
||||
pattern matching using the same syntax and semantics as Perl, with just a few
|
||||
differences. The current implementation of PCRE (release 6.x) corresponds
|
||||
approximately with Perl 5.8, including support for UTF-8 encoded strings and
|
||||
Unicode general category properties. However, this support has to be explicitly
|
||||
enabled; it is not the default.
|
||||
differences. Certain features that appeared in Python and PCRE before they
|
||||
appeared in Perl are also available using the Python syntax. There is also some
|
||||
support for certain .NET and Oniguruma syntax items, and there is an option for
|
||||
requesting some minor changes that give better JavaScript compatibility.
|
||||
.P
|
||||
In addition to the Perl-compatible matching function, PCRE also contains an
|
||||
The current implementation of PCRE (release 7.x) corresponds approximately with
|
||||
Perl 5.10, including support for UTF-8 encoded strings and Unicode general
|
||||
category properties. However, UTF-8 and Unicode support has to be explicitly
|
||||
enabled; it is not the default. The Unicode tables correspond to Unicode
|
||||
release 5.1.
|
||||
.P
|
||||
In addition to the Perl-compatible matching function, PCRE contains an
|
||||
alternative matching function that matches the same compiled patterns in a
|
||||
different way. In certain circumstances, the alternative function has some
|
||||
advantages. For a discussion of the two matching algorithms, see the
|
||||
@@ -43,7 +49,11 @@ and
|
||||
.\" HREF
|
||||
\fBpcrecompat\fR
|
||||
.\"
|
||||
pages.
|
||||
pages. There is a syntax summary in the
|
||||
.\" HREF
|
||||
\fBpcresyntax\fR
|
||||
.\"
|
||||
page.
|
||||
.P
|
||||
Some features of PCRE can be included, excluded, or changed when the library is
|
||||
built. The
|
||||
@@ -77,6 +87,7 @@ all the sections are concatenated, for ease of searching. The sections are as
|
||||
follows:
|
||||
.sp
|
||||
pcre this document
|
||||
pcre-config show PCRE installation configuration information
|
||||
pcreapi details of PCRE's native C API
|
||||
pcrebuild options for building PCRE
|
||||
pcrecallout details of the callout feature
|
||||
@@ -88,6 +99,7 @@ follows:
|
||||
.\" JOIN
|
||||
pcrepattern syntax and semantics of supported
|
||||
regular expressions
|
||||
pcresyntax quick syntax reference
|
||||
pcreperform discussion of performance issues
|
||||
pcreposix the POSIX-compatible C API
|
||||
pcreprecompile details of saving and re-using precompiled patterns
|
||||
@@ -114,18 +126,15 @@ distribution and the
|
||||
\fBpcrebuild\fP
|
||||
.\"
|
||||
documentation for details). In these cases the limit is substantially larger.
|
||||
However, the speed of execution will be slower.
|
||||
However, the speed of execution is slower.
|
||||
.P
|
||||
All values in repeating quantifiers must be less than 65536. The maximum
|
||||
compiled length of subpattern with an explicit repeat count is 30000 bytes. The
|
||||
maximum number of capturing subpatterns is 65535.
|
||||
All values in repeating quantifiers must be less than 65536.
|
||||
.P
|
||||
There is no limit to the number of non-capturing subpatterns, but the maximum
|
||||
depth of nesting of all kinds of parenthesized subpattern, including capturing
|
||||
subpatterns, assertions, and other types of subpattern, is 200.
|
||||
There is no limit to the number of parenthesized subpatterns, but there can be
|
||||
no more than 65535 capturing subpatterns.
|
||||
.P
|
||||
The maximum length of name for a named subpattern is 32, and the maximum number
|
||||
of named subpatterns is 10000.
|
||||
The maximum length of name for a named subpattern is 32 characters, and the
|
||||
maximum number of named subpatterns is 10000.
|
||||
.P
|
||||
The maximum length of a subject string is the largest positive number that an
|
||||
integer variable can hold. However, when using the traditional matching
|
||||
@@ -137,7 +146,7 @@ issues, see the
|
||||
\fBpcrestack\fP
|
||||
.\"
|
||||
documentation.
|
||||
.sp
|
||||
.
|
||||
.\" HTML <a name="utf8support"></a>
|
||||
.
|
||||
.
|
||||
@@ -154,13 +163,14 @@ the code, and, in addition, you must call
|
||||
.\" HREF
|
||||
\fBpcre_compile()\fP
|
||||
.\"
|
||||
with the PCRE_UTF8 option flag. When you do this, both the pattern and any
|
||||
subject strings that are matched against it are treated as UTF-8 strings
|
||||
instead of just strings of bytes.
|
||||
with the PCRE_UTF8 option flag, or the pattern must start with the sequence
|
||||
(*UTF8). When either of these is the case, both the pattern and any subject
|
||||
strings that are matched against it are treated as UTF-8 strings instead of
|
||||
just strings of bytes.
|
||||
.P
|
||||
If you compile PCRE with UTF-8 support, but do not use it at run time, the
|
||||
library will be a bit bigger, but the additional run time overhead is limited
|
||||
to testing the PCRE_UTF8 flag in several places, so should not be very large.
|
||||
to testing the PCRE_UTF8 flag occasionally, so should not be very big.
|
||||
.P
|
||||
If PCRE is built with Unicode character property support (which implies UTF-8
|
||||
support), the escape sequences \ep{..}, \eP{..}, and \eX are supported.
|
||||
@@ -175,46 +185,83 @@ documentation. Only the short names for properties are supported. For example,
|
||||
\ep{L} matches a letter. Its Perl synonym, \ep{Letter}, is not supported.
|
||||
Furthermore, in Perl, many properties may optionally be prefixed by "Is", for
|
||||
compatibility with Perl 5.6. PCRE does not support this.
|
||||
.
|
||||
.\" HTML <a name="utf8strings"></a>
|
||||
.
|
||||
.SS "Validity of UTF-8 strings"
|
||||
.rs
|
||||
.sp
|
||||
When you set the PCRE_UTF8 flag, the strings passed as patterns and subjects
|
||||
are (by default) checked for validity on entry to the relevant functions. From
|
||||
release 7.3 of PCRE, the check is according to the rules of RFC 3629, which are
|
||||
themselves derived from the Unicode specification. Earlier releases of PCRE
|
||||
followed the rules of RFC 2279, which allows the full range of 31-bit values (0
|
||||
to 0x7FFFFFFF). The current check allows only values in the range U+0 to
|
||||
U+10FFFF, excluding U+D800 to U+DFFF.
|
||||
.P
|
||||
The following comments apply when PCRE is running in UTF-8 mode:
|
||||
The excluded code points are the "Low Surrogate Area" of Unicode, of which the
|
||||
Unicode Standard says this: "The Low Surrogate Area does not contain any
|
||||
character assignments, consequently no character code charts or namelists are
|
||||
provided for this area. Surrogates are reserved for use with UTF-16 and then
|
||||
must be used in pairs." The code points that are encoded by UTF-16 pairs are
|
||||
available as independent code points in the UTF-8 encoding. (In other words,
|
||||
the whole surrogate thing is a fudge for UTF-16 which unfortunately messes up
|
||||
UTF-8.)
|
||||
.P
|
||||
1. When you set the PCRE_UTF8 flag, the strings passed as patterns and subjects
|
||||
are checked for validity on entry to the relevant functions. If an invalid
|
||||
UTF-8 string is passed, an error return is given. In some situations, you may
|
||||
already know that your strings are valid, and therefore want to skip these
|
||||
checks in order to improve performance. If you set the PCRE_NO_UTF8_CHECK flag
|
||||
at compile time or at run time, PCRE assumes that the pattern or subject it
|
||||
is given (respectively) contains only valid UTF-8 codes. In this case, it does
|
||||
not diagnose an invalid UTF-8 string. If you pass an invalid UTF-8 string to
|
||||
PCRE when PCRE_NO_UTF8_CHECK is set, the results are undefined. Your program
|
||||
may crash.
|
||||
If an invalid UTF-8 string is passed to PCRE, an error return
|
||||
(PCRE_ERROR_BADUTF8) is given. In some situations, you may already know that
|
||||
your strings are valid, and therefore want to skip these checks in order to
|
||||
improve performance. If you set the PCRE_NO_UTF8_CHECK flag at compile time or
|
||||
at run time, PCRE assumes that the pattern or subject it is given
|
||||
(respectively) contains only valid UTF-8 codes. In this case, it does not
|
||||
diagnose an invalid UTF-8 string.
|
||||
.P
|
||||
2. An unbraced hexadecimal escape sequence (such as \exb3) matches a two-byte
|
||||
If you pass an invalid UTF-8 string when PCRE_NO_UTF8_CHECK is set, what
|
||||
happens depends on why the string is invalid. If the string conforms to the
|
||||
"old" definition of UTF-8 (RFC 2279), it is processed as a string of characters
|
||||
in the range 0 to 0x7FFFFFFF. In other words, apart from the initial validity
|
||||
test, PCRE (when in UTF-8 mode) handles strings according to the more liberal
|
||||
rules of RFC 2279. However, if the string does not even conform to RFC 2279,
|
||||
the result is undefined. Your program may crash.
|
||||
.P
|
||||
If you want to process strings of values in the full range 0 to 0x7FFFFFFF,
|
||||
encoded in a UTF-8-like manner as per the old RFC, you can set
|
||||
PCRE_NO_UTF8_CHECK to bypass the more restrictive test. However, in this
|
||||
situation, you will have to apply your own validity check.
|
||||
.
|
||||
.SS "General comments about UTF-8 mode"
|
||||
.rs
|
||||
.sp
|
||||
1. An unbraced hexadecimal escape sequence (such as \exb3) matches a two-byte
|
||||
UTF-8 character if the value is greater than 127.
|
||||
.P
|
||||
3. Octal numbers up to \e777 are recognized, and match two-byte UTF-8
|
||||
2. Octal numbers up to \e777 are recognized, and match two-byte UTF-8
|
||||
characters for values greater than \e177.
|
||||
.P
|
||||
4. Repeat quantifiers apply to complete UTF-8 characters, not to individual
|
||||
3. Repeat quantifiers apply to complete UTF-8 characters, not to individual
|
||||
bytes, for example: \ex{100}{3}.
|
||||
.P
|
||||
5. The dot metacharacter matches one UTF-8 character instead of a single byte.
|
||||
4. The dot metacharacter matches one UTF-8 character instead of a single byte.
|
||||
.P
|
||||
6. The escape sequence \eC can be used to match a single byte in UTF-8 mode,
|
||||
5. The escape sequence \eC can be used to match a single byte in UTF-8 mode,
|
||||
but its use can lead to some strange effects. This facility is not available in
|
||||
the alternative matching function, \fBpcre_dfa_exec()\fP.
|
||||
.P
|
||||
7. The character escapes \eb, \eB, \ed, \eD, \es, \eS, \ew, and \eW correctly
|
||||
6. The character escapes \eb, \eB, \ed, \eD, \es, \eS, \ew, and \eW correctly
|
||||
test characters of any code value, but the characters that PCRE recognizes as
|
||||
digits, spaces, or word characters remain the same set as before, all with
|
||||
values less than 256. This remains true even when PCRE includes Unicode
|
||||
property support, because to do otherwise would slow down PCRE in many common
|
||||
cases. If you really want to test for a wider sense of, say, "digit", you
|
||||
must use Unicode property tests such as \ep{Nd}.
|
||||
must use Unicode property tests such as \ep{Nd}. Note that this also applies to
|
||||
\eb, because it is defined in terms of \ew and \eW.
|
||||
.P
|
||||
8. Similarly, characters that match the POSIX named character classes are all
|
||||
7. Similarly, characters that match the POSIX named character classes are all
|
||||
low-valued characters.
|
||||
.P
|
||||
8. However, the Perl 5.10 horizontal and vertical whitespace matching escapes
|
||||
(\eh, \eH, \ev, and \eV) do match all the appropriate Unicode characters.
|
||||
.P
|
||||
9. Case-insensitive matching applies only to characters whose values are less
|
||||
than 128, unless PCRE is built with Unicode property support. Even when Unicode
|
||||
property support is available, PCRE still uses its own character tables when
|
||||
@@ -225,20 +272,25 @@ case-insensitive matching only when there is a one-to-one mapping between a
|
||||
letter's cases. There are a small number of many-to-one mappings in Unicode;
|
||||
these are not supported by PCRE.
|
||||
.
|
||||
.
|
||||
.SH AUTHOR
|
||||
.rs
|
||||
.sp
|
||||
.nf
|
||||
Philip Hazel
|
||||
.br
|
||||
University Computing Service,
|
||||
.br
|
||||
Cambridge CB2 3QG, England.
|
||||
University Computing Service
|
||||
Cambridge CB2 3QH, England.
|
||||
.fi
|
||||
.P
|
||||
Putting an actual email address here seems to have been a spam magnet, so I've
|
||||
taken it away. If you want to email me, use my initial and surname, separated
|
||||
by a dot, at the domain ucs.cam.ac.uk.
|
||||
taken it away. If you want to email me, use my two initials, followed by the
|
||||
two digits 10, at the domain cam.ac.uk.
|
||||
.
|
||||
.
|
||||
.SH REVISION
|
||||
.rs
|
||||
.sp
|
||||
.in 0
|
||||
Last updated: 05 June 2006
|
||||
.br
|
||||
Copyright (c) 1997-2006 University of Cambridge.
|
||||
.nf
|
||||
Last updated: 11 April 2009
|
||||
Copyright (c) 1997-2009 University of Cambridge.
|
||||
.fi
|
||||
|
||||
+2770
-1243
File diff suppressed because it is too large
Load Diff
@@ -7,7 +7,6 @@ PCRE - Perl-compatible regular expressions
|
||||
.B #include <pcre.h>
|
||||
.PP
|
||||
.SM
|
||||
.br
|
||||
.B pcre *pcre_compile(const char *\fIpattern\fP, int \fIoptions\fP,
|
||||
.ti +5n
|
||||
.B const char **\fIerrptr\fP, int *\fIerroffset\fP,
|
||||
@@ -17,8 +16,9 @@ PCRE - Perl-compatible regular expressions
|
||||
.SH DESCRIPTION
|
||||
.rs
|
||||
.sp
|
||||
This function compiles a regular expression into an internal form. Its
|
||||
arguments are:
|
||||
This function compiles a regular expression into an internal form. It is the
|
||||
same as \fBpcre_compile2()\fP, except for the absence of the \fIerrorcodeptr\fP
|
||||
argument. Its arguments are:
|
||||
.sp
|
||||
\fIpattern\fR A zero-terminated string containing the
|
||||
regular expression to be compiled
|
||||
@@ -30,33 +30,41 @@ arguments are:
|
||||
.sp
|
||||
The option bits are:
|
||||
.sp
|
||||
PCRE_ANCHORED Force pattern anchoring
|
||||
PCRE_AUTO_CALLOUT Compile automatic callouts
|
||||
PCRE_CASELESS Do caseless matching
|
||||
PCRE_DOLLAR_ENDONLY $ not to match newline at end
|
||||
PCRE_DOTALL . matches anything including NL
|
||||
PCRE_DUPNAMES Allow duplicate names for subpatterns
|
||||
PCRE_EXTENDED Ignore whitespace and # comments
|
||||
PCRE_EXTRA PCRE extra features
|
||||
(not much use currently)
|
||||
PCRE_FIRSTLINE Force matching to be before newline
|
||||
PCRE_MULTILINE ^ and $ match newlines within data
|
||||
PCRE_NEWLINE_CR Set CR as the newline sequence
|
||||
PCRE_NEWLINE_CRLF Set CRLF as the newline sequence
|
||||
PCRE_NEWLINE_LF Set LF as the newline sequence
|
||||
PCRE_NO_AUTO_CAPTURE Disable numbered capturing paren-
|
||||
theses (named ones available)
|
||||
PCRE_UNGREEDY Invert greediness of quantifiers
|
||||
PCRE_UTF8 Run in UTF-8 mode
|
||||
PCRE_NO_UTF8_CHECK Do not check the pattern for UTF-8
|
||||
validity (only relevant if
|
||||
PCRE_UTF8 is set)
|
||||
PCRE_ANCHORED Force pattern anchoring
|
||||
PCRE_AUTO_CALLOUT Compile automatic callouts
|
||||
PCRE_BSR_ANYCRLF \eR matches only CR, LF, or CRLF
|
||||
PCRE_BSR_UNICODE \eR matches all Unicode line endings
|
||||
PCRE_CASELESS Do caseless matching
|
||||
PCRE_DOLLAR_ENDONLY $ not to match newline at end
|
||||
PCRE_DOTALL . matches anything including NL
|
||||
PCRE_DUPNAMES Allow duplicate names for subpatterns
|
||||
PCRE_EXTENDED Ignore whitespace and # comments
|
||||
PCRE_EXTRA PCRE extra features
|
||||
(not much use currently)
|
||||
PCRE_FIRSTLINE Force matching to be before newline
|
||||
PCRE_JAVASCRIPT_COMPAT JavaScript compatibility
|
||||
PCRE_MULTILINE ^ and $ match newlines within data
|
||||
PCRE_NEWLINE_ANY Recognize any Unicode newline sequence
|
||||
PCRE_NEWLINE_ANYCRLF Recognize CR, LF, and CRLF as newline
|
||||
sequences
|
||||
PCRE_NEWLINE_CR Set CR as the newline sequence
|
||||
PCRE_NEWLINE_CRLF Set CRLF as the newline sequence
|
||||
PCRE_NEWLINE_LF Set LF as the newline sequence
|
||||
PCRE_NO_AUTO_CAPTURE Disable numbered capturing paren-
|
||||
theses (named ones available)
|
||||
PCRE_UNGREEDY Invert greediness of quantifiers
|
||||
PCRE_UTF8 Run in UTF-8 mode
|
||||
PCRE_NO_UTF8_CHECK Do not check the pattern for UTF-8
|
||||
validity (only relevant if
|
||||
PCRE_UTF8 is set)
|
||||
.sp
|
||||
PCRE must be built with UTF-8 support in order to use PCRE_UTF8 and
|
||||
PCRE_NO_UTF8_CHECK.
|
||||
.P
|
||||
The yield of the function is a pointer to a private data structure that
|
||||
contains the compiled pattern, or NULL if an error was detected.
|
||||
contains the compiled pattern, or NULL if an error was detected. Note that
|
||||
compiling regular expressions with one version of PCRE for use with a different
|
||||
version is not guaranteed to work and may cause crashes.
|
||||
.P
|
||||
There is a complete description of the PCRE native API in the
|
||||
.\" HREF
|
||||
|
||||
@@ -7,7 +7,6 @@ PCRE - Perl-compatible regular expressions
|
||||
.B #include <pcre.h>
|
||||
.PP
|
||||
.SM
|
||||
.br
|
||||
.B pcre *pcre_compile2(const char *\fIpattern\fP, int \fIoptions\fP,
|
||||
.ti +5n
|
||||
.B int *\fIerrorcodeptr\fP,
|
||||
@@ -46,6 +45,8 @@ The option bits are:
|
||||
(not much use currently)
|
||||
PCRE_FIRSTLINE Force matching to be before newline
|
||||
PCRE_MULTILINE ^ and $ match newlines within data
|
||||
PCRE_NEWLINE_ANY Recognize any Unicode newline sequence
|
||||
PCRE_NEWLINE_ANYCRLF Recognize CR, LF, and CRLF as newline sequences
|
||||
PCRE_NEWLINE_CR Set CR as the newline sequence
|
||||
PCRE_NEWLINE_CRLF Set CRLF as the newline sequence
|
||||
PCRE_NEWLINE_LF Set LF as the newline sequence
|
||||
@@ -61,7 +62,9 @@ PCRE must be built with UTF-8 support in order to use PCRE_UTF8 and
|
||||
PCRE_NO_UTF8_CHECK.
|
||||
.P
|
||||
The yield of the function is a pointer to a private data structure that
|
||||
contains the compiled pattern, or NULL if an error was detected.
|
||||
contains the compiled pattern, or NULL if an error was detected. Note that
|
||||
compiling regular expressions with one version of PCRE for use with a different
|
||||
version is not guaranteed to work and may cause crashes.
|
||||
.P
|
||||
There is a complete description of the PCRE native API in the
|
||||
.\" HREF
|
||||
|
||||
@@ -7,7 +7,6 @@ PCRE - Perl-compatible regular expressions
|
||||
.B #include <pcre.h>
|
||||
.PP
|
||||
.SM
|
||||
.br
|
||||
.B int pcre_config(int \fIwhat\fP, void *\fIwhere\fP);
|
||||
.
|
||||
.SH DESCRIPTION
|
||||
@@ -26,7 +25,15 @@ The available codes are:
|
||||
PCRE_CONFIG_MATCH_LIMIT Internal resource limit
|
||||
PCRE_CONFIG_MATCH_LIMIT_RECURSION
|
||||
Internal recursion depth limit
|
||||
PCRE_CONFIG_NEWLINE Value of the newline sequence
|
||||
PCRE_CONFIG_NEWLINE Value of the default newline sequence:
|
||||
13 (0x000d) for CR
|
||||
10 (0x000a) for LF
|
||||
3338 (0x0d0a) for CRLF
|
||||
-2 for ANYCRLF
|
||||
-1 for ANY
|
||||
PCRE_CONFIG_BSR Indicates what \eR matches by default:
|
||||
0 all Unicode line endings
|
||||
1 CR, LF, or CRLF only
|
||||
PCRE_CONFIG_POSIX_MALLOC_THRESHOLD
|
||||
Threshold of return slots, above
|
||||
which \fBmalloc()\fR is used by
|
||||
|
||||
@@ -7,7 +7,6 @@ PCRE - Perl-compatible regular expressions
|
||||
.B #include <pcre.h>
|
||||
.PP
|
||||
.SM
|
||||
.br
|
||||
.B int pcre_copy_named_substring(const pcre *\fIcode\fP,
|
||||
.ti +5n
|
||||
.B const char *\fIsubject\fP, int *\fIovector\fP,
|
||||
|
||||
@@ -7,7 +7,6 @@ PCRE - Perl-compatible regular expressions
|
||||
.B #include <pcre.h>
|
||||
.PP
|
||||
.SM
|
||||
.br
|
||||
.B int pcre_copy_substring(const char *\fIsubject\fP, int *\fIovector\fP,
|
||||
.ti +5n
|
||||
.B int \fIstringcount\fP, int \fIstringnumber\fP, char *\fIbuffer\fP,
|
||||
@@ -27,7 +26,7 @@ buffer. The arguments are:
|
||||
\fIbuffer\fP Buffer to receive the string
|
||||
\fIbuffersize\fP Size of buffer
|
||||
.sp
|
||||
The yield is the legnth of the string, PCRE_ERROR_NOMEMORY if the buffer was
|
||||
The yield is the length of the string, PCRE_ERROR_NOMEMORY if the buffer was
|
||||
too small, or PCRE_ERROR_NOSUBSTRING if the string number is invalid.
|
||||
.P
|
||||
There is a complete description of the PCRE native API in the
|
||||
|
||||
@@ -7,7 +7,6 @@ PCRE - Perl-compatible regular expressions
|
||||
.B #include <pcre.h>
|
||||
.PP
|
||||
.SM
|
||||
.br
|
||||
.B int pcre_dfa_exec(const pcre *\fIcode\fP, "const pcre_extra *\fIextra\fP,"
|
||||
.ti +5n
|
||||
.B "const char *\fIsubject\fP," int \fIlength\fP, int \fIstartoffset\fP,
|
||||
@@ -20,9 +19,9 @@ PCRE - Perl-compatible regular expressions
|
||||
.rs
|
||||
.sp
|
||||
This function matches a compiled regular expression against a given subject
|
||||
string, using a DFA matching algorithm (\fInot\fP Perl-compatible). Note that
|
||||
the main, Perl-compatible, matching function is \fBpcre_exec()\fP. The
|
||||
arguments for this function are:
|
||||
string, using an alternative matching algorithm that scans the subject string
|
||||
just once (\fInot\fP Perl-compatible). Note that the main, Perl-compatible,
|
||||
matching function is \fBpcre_exec()\fP. The arguments for this function are:
|
||||
.sp
|
||||
\fIcode\fP Points to the compiled pattern
|
||||
\fIextra\fP Points to an associated \fBpcre_extra\fP structure,
|
||||
@@ -40,12 +39,17 @@ arguments for this function are:
|
||||
The options are:
|
||||
.sp
|
||||
PCRE_ANCHORED Match only at the first position
|
||||
PCRE_BSR_ANYCRLF \eR matches only CR, LF, or CRLF
|
||||
PCRE_BSR_UNICODE \eR matches all Unicode line endings
|
||||
PCRE_NEWLINE_ANY Recognize any Unicode newline sequence
|
||||
PCRE_NEWLINE_ANYCRLF Recognize CR, LF, and CRLF as newline sequences
|
||||
PCRE_NEWLINE_CR Set CR as the newline sequence
|
||||
PCRE_NEWLINE_CRLF Set CRLF as the newline sequence
|
||||
PCRE_NEWLINE_LF Set LF as the newline sequence
|
||||
PCRE_NOTBOL Subject is not the beginning of a line
|
||||
PCRE_NOTEOL Subject is not the end of a line
|
||||
PCRE_NOTEMPTY An empty string is not a valid match
|
||||
PCRE_NO_START_OPTIMIZE Do not do "start-match" optimizations
|
||||
PCRE_NO_UTF8_CHECK Do not check the subject for UTF-8
|
||||
validity (only relevant if PCRE_UTF8
|
||||
was set at compile time)
|
||||
@@ -53,8 +57,8 @@ The options are:
|
||||
PCRE_DFA_SHORTEST Return only the shortest match
|
||||
PCRE_DFA_RESTART This is a restart after a partial match
|
||||
.sp
|
||||
There are restrictions on what may appear in a pattern when matching using the
|
||||
DFA algorithm is requested. Details are given in the
|
||||
There are restrictions on what may appear in a pattern when using this matching
|
||||
function. Details are given in the
|
||||
.\" HREF
|
||||
\fBpcrematching\fP
|
||||
.\"
|
||||
@@ -71,7 +75,7 @@ A \fBpcre_extra\fP structure contains the following fields:
|
||||
.sp
|
||||
The flag bits are PCRE_EXTRA_STUDY_DATA, PCRE_EXTRA_MATCH_LIMIT,
|
||||
PCRE_EXTRA_MATCH_LIMIT_RECURSION, PCRE_EXTRA_CALLOUT_DATA, and
|
||||
PCRE_EXTRA_TABLES. For DFA matching, the \fImatch_limit\fP and
|
||||
PCRE_EXTRA_TABLES. For this matching function, the \fImatch_limit\fP and
|
||||
\fImatch_limit_recursion\fP fields are not used, and must not be set.
|
||||
.P
|
||||
There is a complete description of the PCRE native API in the
|
||||
|
||||
@@ -7,7 +7,6 @@ PCRE - Perl-compatible regular expressions
|
||||
.B #include <pcre.h>
|
||||
.PP
|
||||
.SM
|
||||
.br
|
||||
.B int pcre_exec(const pcre *\fIcode\fP, "const pcre_extra *\fIextra\fP,"
|
||||
.ti +5n
|
||||
.B "const char *\fIsubject\fP," int \fIlength\fP, int \fIstartoffset\fP,
|
||||
@@ -35,19 +34,28 @@ offsets to captured substrings. Its arguments are:
|
||||
The options are:
|
||||
.sp
|
||||
PCRE_ANCHORED Match only at the first position
|
||||
PCRE_BSR_ANYCRLF \eR matches only CR, LF, or CRLF
|
||||
PCRE_BSR_UNICODE \eR matches all Unicode line endings
|
||||
PCRE_NEWLINE_ANY Recognize any Unicode newline sequence
|
||||
PCRE_NEWLINE_ANYCRLF Recognize CR, LF, and CRLF as newline sequences
|
||||
PCRE_NEWLINE_CR Set CR as the newline sequence
|
||||
PCRE_NEWLINE_CRLF Set CRLF as the newline sequence
|
||||
PCRE_NEWLINE_LF Set LF as the newline sequence
|
||||
PCRE_NOTBOL Subject is not the beginning of a line
|
||||
PCRE_NOTEOL Subject is not the end of a line
|
||||
PCRE_NOTEMPTY An empty string is not a valid match
|
||||
PCRE_NO_START_OPTIMIZE Do not do "start-match" optimizations
|
||||
PCRE_NO_UTF8_CHECK Do not check the subject for UTF-8
|
||||
validity (only relevant if PCRE_UTF8
|
||||
was set at compile time)
|
||||
PCRE_PARTIAL Return PCRE_ERROR_PARTIAL for a partial match
|
||||
.sp
|
||||
There are restrictions on what may appear in a pattern when partial matching is
|
||||
requested.
|
||||
requested. For details, see the
|
||||
.\" HREF
|
||||
\fBpcrepartial\fP
|
||||
.\"
|
||||
page.
|
||||
.P
|
||||
A \fBpcre_extra\fP structure contains the following fields:
|
||||
.sp
|
||||
|
||||
@@ -7,7 +7,6 @@ PCRE - Perl-compatible regular expressions
|
||||
.B #include <pcre.h>
|
||||
.PP
|
||||
.SM
|
||||
.br
|
||||
.B void pcre_free_substring(const char *\fIstringptr\fP);
|
||||
.
|
||||
.SH DESCRIPTION
|
||||
|
||||
@@ -7,7 +7,6 @@ PCRE - Perl-compatible regular expressions
|
||||
.B #include <pcre.h>
|
||||
.PP
|
||||
.SM
|
||||
.br
|
||||
.B void pcre_free_substring_list(const char **\fIstringptr\fP);
|
||||
.
|
||||
.SH DESCRIPTION
|
||||
|
||||
@@ -7,7 +7,6 @@ PCRE - Perl-compatible regular expressions
|
||||
.B #include <pcre.h>
|
||||
.PP
|
||||
.SM
|
||||
.br
|
||||
.B int pcre_fullinfo(const pcre *\fIcode\fP, "const pcre_extra *\fIextra\fP,"
|
||||
.ti +5n
|
||||
.B int \fIwhat\fP, void *\fIwhere\fP);
|
||||
@@ -31,13 +30,14 @@ The following information is available:
|
||||
-1 for start of string
|
||||
or after newline, or
|
||||
-2 otherwise
|
||||
PCRE_INFO_FIRSTTABLE Table of first bytes
|
||||
(after studying)
|
||||
PCRE_INFO_FIRSTTABLE Table of first bytes (after studying)
|
||||
PCRE_INFO_JCHANGED Return 1 if (?J) or (?-J) was used
|
||||
PCRE_INFO_LASTLITERAL Literal last byte required
|
||||
PCRE_INFO_NAMECOUNT Number of named subpatterns
|
||||
PCRE_INFO_NAMEENTRYSIZE Size of name table entry
|
||||
PCRE_INFO_NAMETABLE Pointer to name table
|
||||
PCRE_INFO_OPTIONS Options used for compilation
|
||||
PCRE_INFO_OKPARTIAL Return 1 if partial matching can be tried
|
||||
PCRE_INFO_OPTIONS Option bits used for compilation
|
||||
PCRE_INFO_SIZE Size of compiled pattern
|
||||
PCRE_INFO_STUDYSIZE Size of study data
|
||||
.sp
|
||||
|
||||
@@ -7,7 +7,6 @@ PCRE - Perl-compatible regular expressions
|
||||
.B #include <pcre.h>
|
||||
.PP
|
||||
.SM
|
||||
.br
|
||||
.B int pcre_get_named_substring(const pcre *\fIcode\fP,
|
||||
.ti +5n
|
||||
.B const char *\fIsubject\fP, int *\fIovector\fP,
|
||||
@@ -30,9 +29,10 @@ arguments are:
|
||||
\fIstringptr\fP Where to put the string pointer
|
||||
.sp
|
||||
The memory in which the substring is placed is obtained by calling
|
||||
\fBpcre_malloc()\fP. The yield of the function is the length of the extracted
|
||||
substring, PCRE_ERROR_NOMEMORY if sufficient memory could not be obtained, or
|
||||
PCRE_ERROR_NOSUBSTRING if the string name is invalid.
|
||||
\fBpcre_malloc()\fP. The convenience function \fBpcre_free_substring()\fP can
|
||||
be used to free it when it is no longer needed. The yield of the function is
|
||||
the length of the extracted substring, PCRE_ERROR_NOMEMORY if sufficient memory
|
||||
could not be obtained, or PCRE_ERROR_NOSUBSTRING if the string name is invalid.
|
||||
.P
|
||||
There is a complete description of the PCRE native API in the
|
||||
.\" HREF
|
||||
|
||||
@@ -7,7 +7,6 @@ PCRE - Perl-compatible regular expressions
|
||||
.B #include <pcre.h>
|
||||
.PP
|
||||
.SM
|
||||
.br
|
||||
.B int pcre_get_stringnumber(const pcre *\fIcode\fP,
|
||||
.ti +5n
|
||||
.B const char *\fIname\fP);
|
||||
@@ -22,7 +21,10 @@ parenthesis in a compiled pattern. Its arguments are:
|
||||
\fIname\fP Name whose number is required
|
||||
.sp
|
||||
The yield of the function is the number of the parenthesis if the name is
|
||||
found, or PCRE_ERROR_NOSUBSTRING otherwise.
|
||||
found, or PCRE_ERROR_NOSUBSTRING otherwise. When duplicate names are allowed
|
||||
(PCRE_DUPNAMES is set), it is not defined which of the numbers is returned by
|
||||
\fBpcre_get_stringnumber()\fP. You can obtain the complete list by calling
|
||||
\fBpcre_get_stringtable_entries()\fP.
|
||||
.P
|
||||
There is a complete description of the PCRE native API in the
|
||||
.\" HREF
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
.TH PCRE_GET_STRINGNUMBER 3
|
||||
.TH PCRE_GET_STRINGTABLE_ENTRIES 3
|
||||
.SH NAME
|
||||
PCRE - Perl-compatible regular expressions
|
||||
.SH SYNOPSIS
|
||||
@@ -7,7 +7,6 @@ PCRE - Perl-compatible regular expressions
|
||||
.B #include <pcre.h>
|
||||
.PP
|
||||
.SM
|
||||
.br
|
||||
.B int pcre_get_stringtable_entries(const pcre *\fIcode\fP,
|
||||
.ti +5n
|
||||
.B const char *\fIname\fP, char **\fIfirst\fP, char **\fIlast\fP);
|
||||
@@ -34,7 +33,7 @@ the table entries, in the
|
||||
.\" HREF
|
||||
\fBpcreapi\fP
|
||||
.\"
|
||||
page and a description of the POSIX API in the
|
||||
page, and a description of the POSIX API in the
|
||||
.\" HREF
|
||||
\fBpcreposix\fP
|
||||
.\"
|
||||
|
||||
@@ -7,7 +7,6 @@ PCRE - Perl-compatible regular expressions
|
||||
.B #include <pcre.h>
|
||||
.PP
|
||||
.SM
|
||||
.br
|
||||
.B int pcre_get_substring(const char *\fIsubject\fP, int *\fIovector\fP,
|
||||
.ti +5n
|
||||
.B int \fIstringcount\fP, int \fIstringnumber\fP,
|
||||
@@ -27,9 +26,10 @@ arguments are:
|
||||
\fIstringptr\fP Where to put the string pointer
|
||||
.sp
|
||||
The memory in which the substring is placed is obtained by calling
|
||||
\fBpcre_malloc()\fP. The yield of the function is the length of the substring,
|
||||
PCRE_ERROR_NOMEMORY if sufficient memory could not be obtained, or
|
||||
PCRE_ERROR_NOSUBSTRING if the string number is invalid.
|
||||
\fBpcre_malloc()\fP. The convenience function \fBpcre_free_substring()\fP can
|
||||
be used to free it when it is no longer needed. The yield of the function is
|
||||
the length of the substring, PCRE_ERROR_NOMEMORY if sufficient memory could not
|
||||
be obtained, or PCRE_ERROR_NOSUBSTRING if the string number is invalid.
|
||||
.P
|
||||
There is a complete description of the PCRE native API in the
|
||||
.\" HREF
|
||||
|
||||
@@ -7,7 +7,6 @@ PCRE - Perl-compatible regular expressions
|
||||
.B #include <pcre.h>
|
||||
.PP
|
||||
.SM
|
||||
.br
|
||||
.B int pcre_get_substring_list(const char *\fIsubject\fP,
|
||||
.ti +5n
|
||||
.B int *\fIovector\fP, int \fIstringcount\fP, "const char ***\fIlistptr\fP);"
|
||||
@@ -24,10 +23,12 @@ substrings. The arguments are:
|
||||
\fIlistptr\fP Where to put a pointer to the list
|
||||
.sp
|
||||
The memory in which the substrings and the list are placed is obtained by
|
||||
calling \fBpcre_malloc()\fP. A pointer to a list of pointers is put in
|
||||
the variable whose address is in \fIlistptr\fP. The list is terminated by a
|
||||
NULL pointer. The yield of the function is zero on success or
|
||||
PCRE_ERROR_NOMEMORY if sufficient memory could not be obtained.
|
||||
calling \fBpcre_malloc()\fP. The convenience function
|
||||
\fBpcre_free_substring_list()\fP can be used to free it when it is no longer
|
||||
needed. A pointer to a list of pointers is put in the variable whose address is
|
||||
in \fIlistptr\fP. The list is terminated by a NULL pointer. The yield of the
|
||||
function is zero on success or PCRE_ERROR_NOMEMORY if sufficient memory could
|
||||
not be obtained.
|
||||
.P
|
||||
There is a complete description of the PCRE native API in the
|
||||
.\" HREF
|
||||
|
||||
@@ -7,7 +7,6 @@ PCRE - Perl-compatible regular expressions
|
||||
.B #include <pcre.h>
|
||||
.PP
|
||||
.SM
|
||||
.br
|
||||
.B int pcre_info(const pcre *\fIcode\fP, int *\fIoptptr\fP, int
|
||||
.B *\fIfirstcharptr\fP);
|
||||
.
|
||||
|
||||
@@ -7,7 +7,6 @@ PCRE - Perl-compatible regular expressions
|
||||
.B #include <pcre.h>
|
||||
.PP
|
||||
.SM
|
||||
.br
|
||||
.B const unsigned char *pcre_maketables(void);
|
||||
.
|
||||
.SH DESCRIPTION
|
||||
|
||||
@@ -7,7 +7,6 @@ PCRE - Perl-compatible regular expressions
|
||||
.B #include <pcre.h>
|
||||
.PP
|
||||
.SM
|
||||
.br
|
||||
.B int pcre_refcount(pcre *\fIcode\fP, int \fIadjust\fP);
|
||||
.
|
||||
.SH DESCRIPTION
|
||||
|
||||
@@ -7,7 +7,6 @@ PCRE - Perl-compatible regular expressions
|
||||
.B #include <pcre.h>
|
||||
.PP
|
||||
.SM
|
||||
.br
|
||||
.B pcre_extra *pcre_study(const pcre *\fIcode\fP, int \fIoptions\fP,
|
||||
.ti +5n
|
||||
.B const char **\fIerrptr\fP);
|
||||
|
||||
@@ -7,7 +7,6 @@ PCRE - Perl-compatible regular expressions
|
||||
.B #include <pcre.h>
|
||||
.PP
|
||||
.SM
|
||||
.br
|
||||
.B char *pcre_version(void);
|
||||
.
|
||||
.SH DESCRIPTION
|
||||
|
||||
+379
-167
@@ -7,14 +7,12 @@ PCRE - Perl-compatible regular expressions
|
||||
.B #include <pcre.h>
|
||||
.PP
|
||||
.SM
|
||||
.br
|
||||
.B pcre *pcre_compile(const char *\fIpattern\fP, int \fIoptions\fP,
|
||||
.ti +5n
|
||||
.B const char **\fIerrptr\fP, int *\fIerroffset\fP,
|
||||
.ti +5n
|
||||
.B const unsigned char *\fItableptr\fP);
|
||||
.PP
|
||||
.br
|
||||
.B pcre *pcre_compile2(const char *\fIpattern\fP, int \fIoptions\fP,
|
||||
.ti +5n
|
||||
.B int *\fIerrorcodeptr\fP,
|
||||
@@ -23,19 +21,16 @@ PCRE - Perl-compatible regular expressions
|
||||
.ti +5n
|
||||
.B const unsigned char *\fItableptr\fP);
|
||||
.PP
|
||||
.br
|
||||
.B pcre_extra *pcre_study(const pcre *\fIcode\fP, int \fIoptions\fP,
|
||||
.ti +5n
|
||||
.B const char **\fIerrptr\fP);
|
||||
.PP
|
||||
.br
|
||||
.B int pcre_exec(const pcre *\fIcode\fP, "const pcre_extra *\fIextra\fP,"
|
||||
.ti +5n
|
||||
.B "const char *\fIsubject\fP," int \fIlength\fP, int \fIstartoffset\fP,
|
||||
.ti +5n
|
||||
.B int \fIoptions\fP, int *\fIovector\fP, int \fIovecsize\fP);
|
||||
.PP
|
||||
.br
|
||||
.B int pcre_dfa_exec(const pcre *\fIcode\fP, "const pcre_extra *\fIextra\fP,"
|
||||
.ti +5n
|
||||
.B "const char *\fIsubject\fP," int \fIlength\fP, int \fIstartoffset\fP,
|
||||
@@ -44,7 +39,6 @@ PCRE - Perl-compatible regular expressions
|
||||
.ti +5n
|
||||
.B int *\fIworkspace\fP, int \fIwscount\fP);
|
||||
.PP
|
||||
.br
|
||||
.B int pcre_copy_named_substring(const pcre *\fIcode\fP,
|
||||
.ti +5n
|
||||
.B const char *\fIsubject\fP, int *\fIovector\fP,
|
||||
@@ -53,14 +47,12 @@ PCRE - Perl-compatible regular expressions
|
||||
.ti +5n
|
||||
.B char *\fIbuffer\fP, int \fIbuffersize\fP);
|
||||
.PP
|
||||
.br
|
||||
.B int pcre_copy_substring(const char *\fIsubject\fP, int *\fIovector\fP,
|
||||
.ti +5n
|
||||
.B int \fIstringcount\fP, int \fIstringnumber\fP, char *\fIbuffer\fP,
|
||||
.ti +5n
|
||||
.B int \fIbuffersize\fP);
|
||||
.PP
|
||||
.br
|
||||
.B int pcre_get_named_substring(const pcre *\fIcode\fP,
|
||||
.ti +5n
|
||||
.B const char *\fIsubject\fP, int *\fIovector\fP,
|
||||
@@ -69,76 +61,59 @@ PCRE - Perl-compatible regular expressions
|
||||
.ti +5n
|
||||
.B const char **\fIstringptr\fP);
|
||||
.PP
|
||||
.br
|
||||
.B int pcre_get_stringnumber(const pcre *\fIcode\fP,
|
||||
.ti +5n
|
||||
.B const char *\fIname\fP);
|
||||
.PP
|
||||
.br
|
||||
.B int pcre_get_stringtable_entries(const pcre *\fIcode\fP,
|
||||
.ti +5n
|
||||
.B const char *\fIname\fP, char **\fIfirst\fP, char **\fIlast\fP);
|
||||
.PP
|
||||
.br
|
||||
.B int pcre_get_substring(const char *\fIsubject\fP, int *\fIovector\fP,
|
||||
.ti +5n
|
||||
.B int \fIstringcount\fP, int \fIstringnumber\fP,
|
||||
.ti +5n
|
||||
.B const char **\fIstringptr\fP);
|
||||
.PP
|
||||
.br
|
||||
.B int pcre_get_substring_list(const char *\fIsubject\fP,
|
||||
.ti +5n
|
||||
.B int *\fIovector\fP, int \fIstringcount\fP, "const char ***\fIlistptr\fP);"
|
||||
.PP
|
||||
.br
|
||||
.B void pcre_free_substring(const char *\fIstringptr\fP);
|
||||
.PP
|
||||
.br
|
||||
.B void pcre_free_substring_list(const char **\fIstringptr\fP);
|
||||
.PP
|
||||
.br
|
||||
.B const unsigned char *pcre_maketables(void);
|
||||
.PP
|
||||
.br
|
||||
.B int pcre_fullinfo(const pcre *\fIcode\fP, "const pcre_extra *\fIextra\fP,"
|
||||
.ti +5n
|
||||
.B int \fIwhat\fP, void *\fIwhere\fP);
|
||||
.PP
|
||||
.br
|
||||
.B int pcre_info(const pcre *\fIcode\fP, int *\fIoptptr\fP, int
|
||||
.B *\fIfirstcharptr\fP);
|
||||
.PP
|
||||
.br
|
||||
.B int pcre_refcount(pcre *\fIcode\fP, int \fIadjust\fP);
|
||||
.PP
|
||||
.br
|
||||
.B int pcre_config(int \fIwhat\fP, void *\fIwhere\fP);
|
||||
.PP
|
||||
.br
|
||||
.B char *pcre_version(void);
|
||||
.PP
|
||||
.br
|
||||
.B void *(*pcre_malloc)(size_t);
|
||||
.PP
|
||||
.br
|
||||
.B void (*pcre_free)(void *);
|
||||
.PP
|
||||
.br
|
||||
.B void *(*pcre_stack_malloc)(size_t);
|
||||
.PP
|
||||
.br
|
||||
.B void (*pcre_stack_free)(void *);
|
||||
.PP
|
||||
.br
|
||||
.B int (*pcre_callout)(pcre_callout_block *);
|
||||
.
|
||||
.
|
||||
.SH "PCRE API OVERVIEW"
|
||||
.rs
|
||||
.sp
|
||||
PCRE has its own native API, which is described in this document. There is
|
||||
also a set of wrapper functions that correspond to the POSIX regular expression
|
||||
PCRE has its own native API, which is described in this document. There are
|
||||
also some wrapper functions that correspond to the POSIX regular expression
|
||||
API. These are described in the
|
||||
.\" HREF
|
||||
\fBpcreposix\fP
|
||||
@@ -165,14 +140,14 @@ distribution. The
|
||||
.\" HREF
|
||||
\fBpcresample\fP
|
||||
.\"
|
||||
documentation describes how to run it.
|
||||
documentation describes how to compile and run it.
|
||||
.P
|
||||
A second matching function, \fBpcre_dfa_exec()\fP, which is not
|
||||
Perl-compatible, is also provided. This uses a different algorithm for the
|
||||
matching. The alternative algorithm finds all possible matches (at a given
|
||||
point in the subject). However, this algorithm does not return captured
|
||||
substrings. A description of the two matching algorithms and their advantages
|
||||
and disadvantages is given in the
|
||||
point in the subject), and scans the subject just once. However, this algorithm
|
||||
does not return captured substrings. A description of the two matching
|
||||
algorithms and their advantages and disadvantages is given in the
|
||||
.\" HREF
|
||||
\fBpcrematching\fP
|
||||
.\"
|
||||
@@ -243,16 +218,47 @@ points during a matching operation. Details are given in the
|
||||
documentation.
|
||||
.
|
||||
.
|
||||
.\" HTML <a name="newlines"></a>
|
||||
.SH NEWLINES
|
||||
PCRE supports three different conventions for indicating line breaks in
|
||||
strings: a single CR character, a single LF character, or the two-character
|
||||
sequence CRLF. All three are used as "standard" by different operating systems.
|
||||
When PCRE is built, a default can be specified. The default default is LF,
|
||||
which is the Unix standard. When PCRE is run, the default can be overridden,
|
||||
either when a pattern is compiled, or when it is matched.
|
||||
.rs
|
||||
.sp
|
||||
PCRE supports five different conventions for indicating line breaks in
|
||||
strings: a single CR (carriage return) character, a single LF (linefeed)
|
||||
character, the two-character sequence CRLF, any of the three preceding, or any
|
||||
Unicode newline sequence. The Unicode newline sequences are the three just
|
||||
mentioned, plus the single characters VT (vertical tab, U+000B), FF (formfeed,
|
||||
U+000C), NEL (next line, U+0085), LS (line separator, U+2028), and PS
|
||||
(paragraph separator, U+2029).
|
||||
.P
|
||||
Each of the first three conventions is used by at least one operating system as
|
||||
its standard newline sequence. When PCRE is built, a default can be specified.
|
||||
The default default is LF, which is the Unix standard. When PCRE is run, the
|
||||
default can be overridden, either when a pattern is compiled, or when it is
|
||||
matched.
|
||||
.P
|
||||
At compile time, the newline convention can be specified by the \fIoptions\fP
|
||||
argument of \fBpcre_compile()\fP, or it can be specified by special text at the
|
||||
start of the pattern itself; this overrides any other settings. See the
|
||||
.\" HREF
|
||||
\fBpcrepattern\fP
|
||||
.\"
|
||||
page for details of the special character sequences.
|
||||
.P
|
||||
In the PCRE documentation the word "newline" is used to mean "the character or
|
||||
pair of characters that indicate a line break".
|
||||
pair of characters that indicate a line break". The choice of newline
|
||||
convention affects the handling of the dot, circumflex, and dollar
|
||||
metacharacters, the handling of #-comments in /x mode, and, when CRLF is a
|
||||
recognized line ending sequence, the match position advancement for a
|
||||
non-anchored pattern. There is more detail about this in the
|
||||
.\" HTML <a href="#execoptions">
|
||||
.\" </a>
|
||||
section on \fBpcre_exec()\fP options
|
||||
.\"
|
||||
below.
|
||||
.P
|
||||
The choice of newline convention does not affect the interpretation of
|
||||
the \en or \er escape sequences, nor does it affect what \eR matches, which is
|
||||
controlled in a similar way, but by separate options.
|
||||
.
|
||||
.
|
||||
.SH MULTITHREADING
|
||||
@@ -276,7 +282,9 @@ which it was compiled. Details are given in the
|
||||
.\" HREF
|
||||
\fBpcreprecompile\fP
|
||||
.\"
|
||||
documentation.
|
||||
documentation. However, compiling a regular expression with one version of PCRE
|
||||
for use with a different version is not guaranteed to work and may cause
|
||||
crashes.
|
||||
.
|
||||
.
|
||||
.SH "CHECKING BUILD-TIME OPTIONS"
|
||||
@@ -308,9 +316,18 @@ properties is available; otherwise it is set to zero.
|
||||
PCRE_CONFIG_NEWLINE
|
||||
.sp
|
||||
The output is an integer whose value specifies the default character sequence
|
||||
that is recognized as meaning "newline". The three values that are supported
|
||||
are: 10 for LF, 13 for CR, and 3338 for CRLF. The default should normally be
|
||||
the standard sequence for your operating system.
|
||||
that is recognized as meaning "newline". The five values that are supported
|
||||
are: 10 for LF, 13 for CR, 3338 for CRLF, -2 for ANYCRLF, and -1 for ANY.
|
||||
Though they are derived from ASCII, the same values are returned in EBCDIC
|
||||
environments. The default should normally correspond to the standard sequence
|
||||
for your operating system.
|
||||
.sp
|
||||
PCRE_CONFIG_BSR
|
||||
.sp
|
||||
The output is an integer whose value indicates what character sequences the \eR
|
||||
escape sequence matches by default. A value of 0 means that \eR matches any
|
||||
Unicode line ending sequence; a value of 1 means that \eR matches only CR, LF,
|
||||
or CRLF. The default can be overridden when a pattern is compiled or matched.
|
||||
.sp
|
||||
PCRE_CONFIG_LINK_SIZE
|
||||
.sp
|
||||
@@ -332,13 +349,13 @@ documentation.
|
||||
.sp
|
||||
PCRE_CONFIG_MATCH_LIMIT
|
||||
.sp
|
||||
The output is an integer that gives the default limit for the number of
|
||||
The output is a long integer that gives the default limit for the number of
|
||||
internal matching function calls in a \fBpcre_exec()\fP execution. Further
|
||||
details are given with \fBpcre_exec()\fP below.
|
||||
.sp
|
||||
PCRE_CONFIG_MATCH_LIMIT_RECURSION
|
||||
.sp
|
||||
The output is an integer that gives the default limit for the depth of
|
||||
The output is a long integer that gives the default limit for the depth of
|
||||
recursion when calling the internal matching function in a \fBpcre_exec()\fP
|
||||
execution. Further details are given with \fBpcre_exec()\fP below.
|
||||
.sp
|
||||
@@ -387,18 +404,19 @@ depend on memory location, the complete \fBpcre\fP data block is not
|
||||
fully relocatable, because it may contain a copy of the \fItableptr\fP
|
||||
argument, which is an address (see below).
|
||||
.P
|
||||
The \fIoptions\fP argument contains independent bits that affect the
|
||||
The \fIoptions\fP argument contains various bit settings that affect the
|
||||
compilation. It should be zero if no options are required. The available
|
||||
options are described below. Some of them, in particular, those that are
|
||||
compatible with Perl, can also be set and unset from within the pattern (see
|
||||
the detailed description in the
|
||||
options are described below. Some of them (in particular, those that are
|
||||
compatible with Perl, but also some others) can also be set and unset from
|
||||
within the pattern (see the detailed description in the
|
||||
.\" HREF
|
||||
\fBpcrepattern\fP
|
||||
.\"
|
||||
documentation). For these options, the contents of the \fIoptions\fP argument
|
||||
specifies their initial settings at the start of compilation and execution. The
|
||||
PCRE_ANCHORED and PCRE_NEWLINE_\fIxxx\fP options can be set at the time of
|
||||
matching as well as at compile time.
|
||||
documentation). For those options that can be different in different parts of
|
||||
the pattern, the contents of the \fIoptions\fP argument specify their initial
|
||||
settings at the start of compilation and execution. The PCRE_ANCHORED and
|
||||
PCRE_NEWLINE_\fIxxx\fP options can be set at the time of matching as well as at
|
||||
compile time.
|
||||
.P
|
||||
If \fIerrptr\fP is NULL, \fBpcre_compile()\fP returns NULL immediately.
|
||||
Otherwise, if compilation of a pattern fails, \fBpcre_compile()\fP returns
|
||||
@@ -452,6 +470,15 @@ facility, see the
|
||||
\fBpcrecallout\fP
|
||||
.\"
|
||||
documentation.
|
||||
.sp
|
||||
PCRE_BSR_ANYCRLF
|
||||
PCRE_BSR_UNICODE
|
||||
.sp
|
||||
These options (which are mutually exclusive) control what the \eR escape
|
||||
sequence matches. The choice is either to match only CR, LF, or CRLF, or to
|
||||
match any Unicode newline sequence. The default is specified when PCRE is
|
||||
built. It can be overridden from within the pattern, or by setting an option
|
||||
when a compiled pattern is matched.
|
||||
.sp
|
||||
PCRE_CASELESS
|
||||
.sp
|
||||
@@ -480,8 +507,8 @@ If this bit is set, a dot metacharater in the pattern matches all characters,
|
||||
including those that indicate newline. Without it, a dot does not match when
|
||||
the current position is at a newline. This option is equivalent to Perl's /s
|
||||
option, and it can be changed within a pattern by a (?s) option setting. A
|
||||
negative class such as [^a] always matches newlines, independent of the setting
|
||||
of this option.
|
||||
negative class such as [^a] always matches newline characters, independent of
|
||||
the setting of this option.
|
||||
.sp
|
||||
PCRE_DUPNAMES
|
||||
.sp
|
||||
@@ -524,6 +551,20 @@ this option. It can also be set by a (?X) option setting within a pattern.
|
||||
If this option is set, an unanchored pattern is required to match before or at
|
||||
the first newline in the subject string, though the matched text may continue
|
||||
over the newline.
|
||||
.sp
|
||||
PCRE_JAVASCRIPT_COMPAT
|
||||
.sp
|
||||
If this option is set, PCRE's behaviour is changed in some ways so that it is
|
||||
compatible with JavaScript rather than Perl. The changes are as follows:
|
||||
.P
|
||||
(1) A lone closing square bracket in a pattern causes a compile-time error,
|
||||
because this is illegal in JavaScript (by default it is treated as a data
|
||||
character). Thus, the pattern AB]CD becomes illegal when this option is set.
|
||||
.P
|
||||
(2) At run time, a back reference to an unset subpattern group matches an empty
|
||||
string (by default this causes the current matching alternative to fail). A
|
||||
pattern such as (\e1)(a) succeeds when this option is set (assuming it can find
|
||||
an "a" in the subject), whereas it fails by default, for Perl compatibility.
|
||||
.sp
|
||||
PCRE_MULTILINE
|
||||
.sp
|
||||
@@ -544,18 +585,37 @@ occurrences of ^ or $ in a pattern, setting PCRE_MULTILINE has no effect.
|
||||
PCRE_NEWLINE_CR
|
||||
PCRE_NEWLINE_LF
|
||||
PCRE_NEWLINE_CRLF
|
||||
PCRE_NEWLINE_ANYCRLF
|
||||
PCRE_NEWLINE_ANY
|
||||
.sp
|
||||
These options override the default newline definition that was chosen when PCRE
|
||||
was built. Setting the first or the second specifies that a newline is
|
||||
indicated by a single character (CR or LF, respectively). Setting both of them
|
||||
specifies that a newline is indicated by the two-character CRLF sequence. For
|
||||
convenience, PCRE_NEWLINE_CRLF is defined to contain both bits. The only time
|
||||
that a line break is relevant when compiling a pattern is if PCRE_EXTENDED is
|
||||
set, and an unescaped # outside a character class is encountered. This
|
||||
indicates a comment that lasts until after the next newline.
|
||||
indicated by a single character (CR or LF, respectively). Setting
|
||||
PCRE_NEWLINE_CRLF specifies that a newline is indicated by the two-character
|
||||
CRLF sequence. Setting PCRE_NEWLINE_ANYCRLF specifies that any of the three
|
||||
preceding sequences should be recognized. Setting PCRE_NEWLINE_ANY specifies
|
||||
that any Unicode newline sequence should be recognized. The Unicode newline
|
||||
sequences are the three just mentioned, plus the single characters VT (vertical
|
||||
tab, U+000B), FF (formfeed, U+000C), NEL (next line, U+0085), LS (line
|
||||
separator, U+2028), and PS (paragraph separator, U+2029). The last two are
|
||||
recognized only in UTF-8 mode.
|
||||
.P
|
||||
The newline option set at compile time becomes the default that is used for
|
||||
\fBpcre_exec()\fP and \fBpcre_dfa_exec()\fP, but it can be overridden.
|
||||
The newline setting in the options word uses three bits that are treated
|
||||
as a number, giving eight possibilities. Currently only six are used (default
|
||||
plus the five values above). This means that if you set more than one newline
|
||||
option, the combination may or may not be sensible. For example,
|
||||
PCRE_NEWLINE_CR with PCRE_NEWLINE_LF is equivalent to PCRE_NEWLINE_CRLF, but
|
||||
other combinations may yield unused numbers and cause an error.
|
||||
.P
|
||||
The only time that a line break is specially recognized when compiling a
|
||||
pattern is if PCRE_EXTENDED is set, and an unescaped # outside a character
|
||||
class is encountered. This indicates a comment that lasts until after the next
|
||||
line break sequence. In other circumstances, line break sequences are treated
|
||||
as literal data, except that in PCRE_EXTENDED mode, both CR and LF are treated
|
||||
as whitespace characters and are therefore ignored.
|
||||
.P
|
||||
The newline option that is set at compile time becomes the default that is used
|
||||
for \fBpcre_exec()\fP and \fBpcre_dfa_exec()\fP, but it can be overridden.
|
||||
.sp
|
||||
PCRE_NO_AUTO_CAPTURE
|
||||
.sp
|
||||
@@ -591,14 +651,22 @@ page.
|
||||
PCRE_NO_UTF8_CHECK
|
||||
.sp
|
||||
When PCRE_UTF8 is set, the validity of the pattern as a UTF-8 string is
|
||||
automatically checked. If an invalid UTF-8 sequence of bytes is found,
|
||||
\fBpcre_compile()\fP returns an error. If you already know that your pattern is
|
||||
valid, and you want to skip this check for performance reasons, you can set the
|
||||
PCRE_NO_UTF8_CHECK option. When it is set, the effect of passing an invalid
|
||||
UTF-8 string as a pattern is undefined. It may cause your program to crash.
|
||||
Note that this option can also be passed to \fBpcre_exec()\fP and
|
||||
\fBpcre_dfa_exec()\fP, to suppress the UTF-8 validity checking of subject
|
||||
strings.
|
||||
automatically checked. There is a discussion about the
|
||||
.\" HTML <a href="pcre.html#utf8strings">
|
||||
.\" </a>
|
||||
validity of UTF-8 strings
|
||||
.\"
|
||||
in the main
|
||||
.\" HREF
|
||||
\fBpcre\fP
|
||||
.\"
|
||||
page. If an invalid UTF-8 sequence of bytes is found, \fBpcre_compile()\fP
|
||||
returns an error. If you already know that your pattern is valid, and you want
|
||||
to skip this check for performance reasons, you can set the PCRE_NO_UTF8_CHECK
|
||||
option. When it is set, the effect of passing an invalid UTF-8 string as a
|
||||
pattern is undefined. It may cause your program to crash. Note that this option
|
||||
can also be passed to \fBpcre_exec()\fP and \fBpcre_dfa_exec()\fP, to suppress
|
||||
the UTF-8 validity checking of subject strings.
|
||||
.
|
||||
.
|
||||
.SH "COMPILATION ERROR CODES"
|
||||
@@ -606,7 +674,8 @@ strings.
|
||||
.sp
|
||||
The following table lists the error codes that may be returned by
|
||||
\fBpcre_compile2()\fP, along with the error messages that may be returned by
|
||||
both compiling functions.
|
||||
both compiling functions. As PCRE has developed, some error codes have fallen
|
||||
out of use. To avoid confusion, they have not been re-used.
|
||||
.sp
|
||||
0 no error
|
||||
1 \e at end of pattern
|
||||
@@ -618,17 +687,17 @@ both compiling functions.
|
||||
7 invalid escape sequence in character class
|
||||
8 range out of order in character class
|
||||
9 nothing to repeat
|
||||
10 operand of unlimited repeat could match the empty string
|
||||
10 [this code is not in use]
|
||||
11 internal error: unexpected repeat
|
||||
12 unrecognized character after (?
|
||||
12 unrecognized character after (? or (?-
|
||||
13 POSIX named classes are supported only within a class
|
||||
14 missing )
|
||||
15 reference to non-existent subpattern
|
||||
16 erroffset passed as NULL
|
||||
17 unknown option bit(s) set
|
||||
18 missing ) after comment
|
||||
19 parentheses nested too deeply
|
||||
20 regular expression too large
|
||||
19 [this code is not in use]
|
||||
20 regular expression is too large
|
||||
21 failed to get memory
|
||||
22 unmatched parentheses
|
||||
23 internal error: code overflow
|
||||
@@ -637,11 +706,11 @@ both compiling functions.
|
||||
26 malformed number or name after (?(
|
||||
27 conditional group contains more than two branches
|
||||
28 assertion expected after (?(
|
||||
29 (?R or (?digits must be followed by )
|
||||
29 (?R or (?[+-]digits must be followed by )
|
||||
30 unknown POSIX class name
|
||||
31 POSIX collating elements are not supported
|
||||
32 this version of PCRE is not compiled with PCRE_UTF8 support
|
||||
33 spare error
|
||||
33 [this code is not in use]
|
||||
34 character value in \ex{...} sequence is too large
|
||||
35 invalid condition (?(0)
|
||||
36 \eC not allowed in lookbehind assertion
|
||||
@@ -650,16 +719,33 @@ both compiling functions.
|
||||
39 closing ) for (?C expected
|
||||
40 recursive call could loop indefinitely
|
||||
41 unrecognized character after (?P
|
||||
42 syntax error after (?P
|
||||
42 syntax error in subpattern name (missing terminator)
|
||||
43 two named subpatterns have the same name
|
||||
44 invalid UTF-8 string
|
||||
45 support for \eP, \ep, and \eX has not been compiled
|
||||
46 malformed \eP or \ep sequence
|
||||
47 unknown property name after \eP or \ep
|
||||
48 subpattern name is too long (maximum 32 characters)
|
||||
49 too many named subpatterns (maximum 10,000)
|
||||
50 repeated subpattern is too long
|
||||
49 too many named subpatterns (maximum 10000)
|
||||
50 [this code is not in use]
|
||||
51 octal value is greater than \e377 (not in UTF-8 mode)
|
||||
52 internal error: overran compiling workspace
|
||||
53 internal error: previously-checked referenced subpattern not found
|
||||
54 DEFINE group contains more than one branch
|
||||
55 repeating a DEFINE group is not allowed
|
||||
56 inconsistent NEWLINE options
|
||||
57 \eg is not followed by a braced, angle-bracketed, or quoted
|
||||
name/number or by a plain number
|
||||
58 a numbered reference must not be zero
|
||||
59 (*VERB) with an argument is not supported
|
||||
60 (*VERB) not recognized
|
||||
61 number is too big
|
||||
62 subpattern name expected
|
||||
63 digit expected after (?+
|
||||
64 ] is an invalid data character in JavaScript compatibility mode
|
||||
.sp
|
||||
The numbers 32 and 10000 in errors 48 and 49 are defaults; different values may
|
||||
be used if the limits were changed when PCRE was built.
|
||||
.
|
||||
.
|
||||
.SH "STUDYING A PATTERN"
|
||||
@@ -719,19 +805,25 @@ bytes is created.
|
||||
.SH "LOCALE SUPPORT"
|
||||
.rs
|
||||
.sp
|
||||
PCRE handles caseless matching, and determines whether characters are letters
|
||||
PCRE handles caseless matching, and determines whether characters are letters,
|
||||
digits, or whatever, by reference to a set of tables, indexed by character
|
||||
value. When running in UTF-8 mode, this applies only to characters with codes
|
||||
less than 128. Higher-valued codes never match escapes such as \ew or \ed, but
|
||||
can be tested with \ep if PCRE is built with Unicode character property
|
||||
support. The use of locales with Unicode is discouraged.
|
||||
support. The use of locales with Unicode is discouraged. If you are handling
|
||||
characters with codes greater than 128, you should either use UTF-8 and
|
||||
Unicode, or use locales, but not try to mix the two.
|
||||
.P
|
||||
An internal set of tables is created in the default C locale when PCRE is
|
||||
built. This is used when the final argument of \fBpcre_compile()\fP is NULL,
|
||||
and is sufficient for many applications. An alternative set of tables can,
|
||||
however, be supplied. These may be created in a different locale from the
|
||||
default. As more and more applications change to using Unicode, the need for
|
||||
this locale support is expected to die away.
|
||||
PCRE contains an internal set of tables that are used when the final argument
|
||||
of \fBpcre_compile()\fP is NULL. These are sufficient for many applications.
|
||||
Normally, the internal tables recognize only ASCII characters. However, when
|
||||
PCRE is built, it is possible to cause the internal tables to be rebuilt in the
|
||||
default "C" locale of the local system, which may cause them to be different.
|
||||
.P
|
||||
The internal tables can always be overridden by tables supplied by the
|
||||
application that calls PCRE. These may be created in a different locale from
|
||||
the default. As more and more applications change to using Unicode, the need
|
||||
for this locale support is expected to die away.
|
||||
.P
|
||||
External tables are built by calling the \fBpcre_maketables()\fP function,
|
||||
which has no arguments, in the relevant locale. The result can then be passed
|
||||
@@ -744,6 +836,9 @@ the following code could be used:
|
||||
tables = pcre_maketables();
|
||||
re = pcre_compile(..., tables);
|
||||
.sp
|
||||
The locale name "fr_FR" is used on Linux and other Unix-like systems; if you
|
||||
are using Windows, the name for the French locale is "french".
|
||||
.P
|
||||
When \fBpcre_maketables()\fP runs, the tables are built in memory that is
|
||||
obtained via \fBpcre_malloc\fP. It is the caller's responsibility to ensure
|
||||
that the memory containing the tables remains available for as long as it is
|
||||
@@ -827,7 +922,7 @@ variable. (This option used to be called PCRE_INFO_FIRSTCHAR; the old name is
|
||||
still recognized for backwards compatibility.)
|
||||
.P
|
||||
If there is a fixed first byte, for example, from a pattern such as
|
||||
(cat|cow|coyote). Otherwise, if either
|
||||
(cat|cow|coyote), its value is returned. Otherwise, if either
|
||||
.sp
|
||||
(a) the pattern was compiled with the PCRE_MULTILINE option, and every branch
|
||||
starts with "^", or
|
||||
@@ -845,6 +940,18 @@ If the pattern was studied, and this resulted in the construction of a 256-bit
|
||||
table indicating a fixed set of bytes for the first byte in any matching
|
||||
string, a pointer to the table is returned. Otherwise NULL is returned. The
|
||||
fourth argument should point to an \fBunsigned char *\fP variable.
|
||||
.sp
|
||||
PCRE_INFO_HASCRORLF
|
||||
.sp
|
||||
Return 1 if the pattern contains any explicit matches for CR or LF characters,
|
||||
otherwise 0. The fourth argument should point to an \fBint\fP variable. An
|
||||
explicit match is either a literal CR or LF character, or \er or \en.
|
||||
.sp
|
||||
PCRE_INFO_JCHANGED
|
||||
.sp
|
||||
Return 1 if the (?J) or (?-J) option setting is used in the pattern, otherwise
|
||||
0. The fourth argument should point to an \fBint\fP variable. (?J) and
|
||||
(?-J) set and unset the local PCRE_DUPNAMES option, respectively.
|
||||
.sp
|
||||
PCRE_INFO_LASTLITERAL
|
||||
.sp
|
||||
@@ -882,8 +989,8 @@ their parentheses numbers. For example, consider the following pattern (assume
|
||||
PCRE_EXTENDED is set, so white space - including newlines - is ignored):
|
||||
.sp
|
||||
.\" JOIN
|
||||
(?P<date> (?P<year>(\ed\ed)?\ed\ed) -
|
||||
(?P<month>\ed\ed) - (?P<day>\ed\ed) )
|
||||
(?<date> (?<year>(\ed\ed)?\ed\ed) -
|
||||
(?<month>\ed\ed) - (?<day>\ed\ed) )
|
||||
.sp
|
||||
There are four named subpatterns, so the table has four entries, and each entry
|
||||
in the table is eight bytes long. The table is as follows, with non-printing
|
||||
@@ -897,13 +1004,26 @@ bytes shows in hexadecimal, and undefined bytes shown as ??:
|
||||
When writing code to extract data from named subpatterns using the
|
||||
name-to-number map, remember that the length of the entries is likely to be
|
||||
different for each compiled pattern.
|
||||
.sp
|
||||
PCRE_INFO_OKPARTIAL
|
||||
.sp
|
||||
Return 1 if the pattern can be used for partial matching, otherwise 0. The
|
||||
fourth argument should point to an \fBint\fP variable. The
|
||||
.\" HREF
|
||||
\fBpcrepartial\fP
|
||||
.\"
|
||||
documentation lists the restrictions that apply to patterns when partial
|
||||
matching is used.
|
||||
.sp
|
||||
PCRE_INFO_OPTIONS
|
||||
.sp
|
||||
Return a copy of the options with which the pattern was compiled. The fourth
|
||||
argument should point to an \fBunsigned long int\fP variable. These option bits
|
||||
are those specified in the call to \fBpcre_compile()\fP, modified by any
|
||||
top-level option settings within the pattern itself.
|
||||
top-level option settings at the start of the pattern itself. In other words,
|
||||
they are the options that will be in force when matching starts. For example,
|
||||
if the pattern /(?im)abc(?-i)d/ is compiled with the PCRE_EXTENDED option, the
|
||||
result is PCRE_CASELESS, PCRE_MULTILINE, and PCRE_EXTENDED.
|
||||
.P
|
||||
A pattern is automatically anchored by PCRE if all of its top-level
|
||||
alternatives begin with one of the following:
|
||||
@@ -1114,12 +1234,14 @@ called. See the
|
||||
.\"
|
||||
documentation for a discussion of saving compiled patterns for later use.
|
||||
.
|
||||
.\" HTML <a name="execoptions"></a>
|
||||
.SS "Option bits for \fBpcre_exec()\fP"
|
||||
.rs
|
||||
.sp
|
||||
The unused bits of the \fIoptions\fP argument for \fBpcre_exec()\fP must be
|
||||
zero. The only bits that may be set are PCRE_ANCHORED, PCRE_NEWLINE_\fIxxx\fP,
|
||||
PCRE_NOTBOL, PCRE_NOTEOL, PCRE_NOTEMPTY, PCRE_NO_UTF8_CHECK and PCRE_PARTIAL.
|
||||
PCRE_NOTBOL, PCRE_NOTEOL, PCRE_NOTEMPTY, PCRE_NO_START_OPTIMIZE,
|
||||
PCRE_NO_UTF8_CHECK and PCRE_PARTIAL.
|
||||
.sp
|
||||
PCRE_ANCHORED
|
||||
.sp
|
||||
@@ -1127,15 +1249,48 @@ The PCRE_ANCHORED option limits \fBpcre_exec()\fP to matching at the first
|
||||
matching position. If a pattern was compiled with PCRE_ANCHORED, or turned out
|
||||
to be anchored by virtue of its contents, it cannot be made unanchored at
|
||||
matching time.
|
||||
.sp
|
||||
PCRE_BSR_ANYCRLF
|
||||
PCRE_BSR_UNICODE
|
||||
.sp
|
||||
These options (which are mutually exclusive) control what the \eR escape
|
||||
sequence matches. The choice is either to match only CR, LF, or CRLF, or to
|
||||
match any Unicode newline sequence. These options override the choice that was
|
||||
made or defaulted when the pattern was compiled.
|
||||
.sp
|
||||
PCRE_NEWLINE_CR
|
||||
PCRE_NEWLINE_LF
|
||||
PCRE_NEWLINE_CRLF
|
||||
PCRE_NEWLINE_ANYCRLF
|
||||
PCRE_NEWLINE_ANY
|
||||
.sp
|
||||
These options override the newline definition that was chosen or defaulted when
|
||||
the pattern was compiled. For details, see the description \fBpcre_compile()\fP
|
||||
above. During matching, the newline choice affects the behaviour of the dot,
|
||||
circumflex, and dollar metacharacters.
|
||||
the pattern was compiled. For details, see the description of
|
||||
\fBpcre_compile()\fP above. During matching, the newline choice affects the
|
||||
behaviour of the dot, circumflex, and dollar metacharacters. It may also alter
|
||||
the way the match position is advanced after a match failure for an unanchored
|
||||
pattern.
|
||||
.P
|
||||
When PCRE_NEWLINE_CRLF, PCRE_NEWLINE_ANYCRLF, or PCRE_NEWLINE_ANY is set, and a
|
||||
match attempt for an unanchored pattern fails when the current position is at a
|
||||
CRLF sequence, and the pattern contains no explicit matches for CR or LF
|
||||
characters, the match position is advanced by two characters instead of one, in
|
||||
other words, to after the CRLF.
|
||||
.P
|
||||
The above rule is a compromise that makes the most common cases work as
|
||||
expected. For example, if the pattern is .+A (and the PCRE_DOTALL option is not
|
||||
set), it does not match the string "\er\enA" because, after failing at the
|
||||
start, it skips both the CR and the LF before retrying. However, the pattern
|
||||
[\er\en]A does match that string, because it contains an explicit CR or LF
|
||||
reference, and so advances only by one character after the first failure.
|
||||
.P
|
||||
An explicit match for CR or LF is either a literal appearance of one of those
|
||||
characters, or one of the \er or \en escape sequences. Implicit matches such as
|
||||
[^X] do not count, nor does \es (which includes CR and LF in the characters
|
||||
that it matches).
|
||||
.P
|
||||
Notwithstanding the above, anomalous effects may still occur when CRLF is a
|
||||
valid newline sequence and explicit \er or \en escapes appear in the pattern.
|
||||
.sp
|
||||
PCRE_NOTBOL
|
||||
.sp
|
||||
@@ -1172,15 +1327,35 @@ matching a null string by first trying the match again at the same offset with
|
||||
PCRE_NOTEMPTY and PCRE_ANCHORED, and then if that fails by advancing the
|
||||
starting offset (see below) and trying an ordinary match again. There is some
|
||||
code that demonstrates how to do this in the \fIpcredemo.c\fP sample program.
|
||||
.sp
|
||||
PCRE_NO_START_OPTIMIZE
|
||||
.sp
|
||||
There are a number of optimizations that \fBpcre_exec()\fP uses at the start of
|
||||
a match, in order to speed up the process. For example, if it is known that a
|
||||
match must start with a specific character, it searches the subject for that
|
||||
character, and fails immediately if it cannot find it, without actually running
|
||||
the main matching function. When callouts are in use, these optimizations can
|
||||
cause them to be skipped. This option disables the "start-up" optimizations,
|
||||
causing performance to suffer, but ensuring that the callouts do occur.
|
||||
.sp
|
||||
PCRE_NO_UTF8_CHECK
|
||||
.sp
|
||||
When PCRE_UTF8 is set at compile time, the validity of the subject as a UTF-8
|
||||
string is automatically checked when \fBpcre_exec()\fP is subsequently called.
|
||||
The value of \fIstartoffset\fP is also checked to ensure that it points to the
|
||||
start of a UTF-8 character. If an invalid UTF-8 sequence of bytes is found,
|
||||
\fBpcre_exec()\fP returns the error PCRE_ERROR_BADUTF8. If \fIstartoffset\fP
|
||||
contains an invalid value, PCRE_ERROR_BADUTF8_OFFSET is returned.
|
||||
start of a UTF-8 character. There is a discussion about the validity of UTF-8
|
||||
strings in the
|
||||
.\" HTML <a href="pcre.html#utf8strings">
|
||||
.\" </a>
|
||||
section on UTF-8 support
|
||||
.\"
|
||||
in the main
|
||||
.\" HREF
|
||||
\fBpcre\fP
|
||||
.\"
|
||||
page. If an invalid UTF-8 sequence of bytes is found, \fBpcre_exec()\fP returns
|
||||
the error PCRE_ERROR_BADUTF8. If \fIstartoffset\fP contains an invalid value,
|
||||
PCRE_ERROR_BADUTF8_OFFSET is returned.
|
||||
.P
|
||||
If you already know that your subject is valid, and you want to skip these
|
||||
checks for performance reasons, you can set the PCRE_NO_UTF8_CHECK option when
|
||||
@@ -1210,11 +1385,11 @@ documentation.
|
||||
.rs
|
||||
.sp
|
||||
The subject string is passed to \fBpcre_exec()\fP as a pointer in
|
||||
\fIsubject\fP, a length in \fIlength\fP, and a starting byte offset in
|
||||
\fIstartoffset\fP. In UTF-8 mode, the byte offset must point to the start of a
|
||||
UTF-8 character. Unlike the pattern string, the subject may contain binary zero
|
||||
bytes. When the starting offset is zero, the search for a match starts at the
|
||||
beginning of the subject, and this is by far the most common case.
|
||||
\fIsubject\fP, a length (in bytes) in \fIlength\fP, and a starting byte offset
|
||||
in \fIstartoffset\fP. In UTF-8 mode, the byte offset must point to the start of
|
||||
a UTF-8 character. Unlike the pattern string, the subject may contain binary
|
||||
zero bytes. When the starting offset is zero, the search for a match starts at
|
||||
the beginning of the subject, and this is by far the most common case.
|
||||
.P
|
||||
A non-zero starting offset is useful when searching for another match in the
|
||||
same subject by calling \fBpcre_exec()\fP again after a previous success.
|
||||
@@ -1248,38 +1423,41 @@ pattern. Following the usage in Jeffrey Friedl's book, this is called
|
||||
a fragment of a pattern that picks out a substring. PCRE supports several other
|
||||
kinds of parenthesized subpattern that do not cause substrings to be captured.
|
||||
.P
|
||||
Captured substrings are returned to the caller via a vector of integer offsets
|
||||
whose address is passed in \fIovector\fP. The number of elements in the vector
|
||||
is passed in \fIovecsize\fP, which must be a non-negative number. \fBNote\fP:
|
||||
this argument is NOT the size of \fIovector\fP in bytes.
|
||||
Captured substrings are returned to the caller via a vector of integers whose
|
||||
address is passed in \fIovector\fP. The number of elements in the vector is
|
||||
passed in \fIovecsize\fP, which must be a non-negative number. \fBNote\fP: this
|
||||
argument is NOT the size of \fIovector\fP in bytes.
|
||||
.P
|
||||
The first two-thirds of the vector is used to pass back captured substrings,
|
||||
each substring using a pair of integers. The remaining third of the vector is
|
||||
used as workspace by \fBpcre_exec()\fP while matching capturing subpatterns,
|
||||
and is not available for passing back information. The length passed in
|
||||
and is not available for passing back information. The number passed in
|
||||
\fIovecsize\fP should always be a multiple of three. If it is not, it is
|
||||
rounded down.
|
||||
.P
|
||||
When a match is successful, information about captured substrings is returned
|
||||
in pairs of integers, starting at the beginning of \fIovector\fP, and
|
||||
continuing up to two-thirds of its length at the most. The first element of a
|
||||
pair is set to the offset of the first character in a substring, and the second
|
||||
is set to the offset of the first character after the end of a substring. The
|
||||
first pair, \fIovector[0]\fP and \fIovector[1]\fP, identify the portion of the
|
||||
subject string matched by the entire pattern. The next pair is used for the
|
||||
first capturing subpattern, and so on. The value returned by \fBpcre_exec()\fP
|
||||
is one more than the highest numbered pair that has been set. For example, if
|
||||
two substrings have been captured, the returned value is 3. If there are no
|
||||
capturing subpatterns, the return value from a successful match is 1,
|
||||
indicating that just the first pair of offsets has been set.
|
||||
continuing up to two-thirds of its length at the most. The first element of
|
||||
each pair is set to the byte offset of the first character in a substring, and
|
||||
the second is set to the byte offset of the first character after the end of a
|
||||
substring. \fBNote\fP: these values are always byte offsets, even in UTF-8
|
||||
mode. They are not character counts.
|
||||
.P
|
||||
The first pair of integers, \fIovector[0]\fP and \fIovector[1]\fP, identify the
|
||||
portion of the subject string matched by the entire pattern. The next pair is
|
||||
used for the first capturing subpattern, and so on. The value returned by
|
||||
\fBpcre_exec()\fP is one more than the highest numbered pair that has been set.
|
||||
For example, if two substrings have been captured, the returned value is 3. If
|
||||
there are no capturing subpatterns, the return value from a successful match is
|
||||
1, indicating that just the first pair of offsets has been set.
|
||||
.P
|
||||
If a capturing subpattern is matched repeatedly, it is the last portion of the
|
||||
string that it matched that is returned.
|
||||
.P
|
||||
If the vector is too small to hold all the captured substring offsets, it is
|
||||
used as far as possible (up to two-thirds of its length), and the function
|
||||
returns a value of zero. In particular, if the substring offsets are not of
|
||||
interest, \fBpcre_exec()\fP may be called with \fIovector\fP passed as NULL and
|
||||
returns a value of zero. If the substring offsets are not of interest,
|
||||
\fBpcre_exec()\fP may be called with \fIovector\fP passed as NULL and
|
||||
\fIovecsize\fP as zero. However, if the pattern contains back references and
|
||||
the \fIovector\fP is not big enough to remember the related substrings, PCRE
|
||||
has to get additional memory for use during matching. Thus it is usually
|
||||
@@ -1336,7 +1514,7 @@ compiled in an environment of one endianness is run in an environment with the
|
||||
other endianness. This is the error that PCRE gives when the magic number is
|
||||
not present.
|
||||
.sp
|
||||
PCRE_ERROR_UNKNOWN_NODE (-5)
|
||||
PCRE_ERROR_UNKNOWN_OPCODE (-5)
|
||||
.sp
|
||||
While running the pattern match, an unknown item was encountered in the
|
||||
compiled pattern. This error could be caused by a bug in PCRE or by overwriting
|
||||
@@ -1361,12 +1539,6 @@ below). It is never returned by \fBpcre_exec()\fP.
|
||||
The backtracking limit, as specified by the \fImatch_limit\fP field in a
|
||||
\fBpcre_extra\fP structure (or defaulted) was reached. See the description
|
||||
above.
|
||||
.sp
|
||||
PCRE_ERROR_RECURSIONLIMIT (-21)
|
||||
.sp
|
||||
The internal recursion limit, as specified by the \fImatch_limit_recursion\fP
|
||||
field in a \fBpcre_extra\fP structure (or defaulted) was reached. See the
|
||||
description above.
|
||||
.sp
|
||||
PCRE_ERROR_CALLOUT (-9)
|
||||
.sp
|
||||
@@ -1411,6 +1583,18 @@ in PCRE or by overwriting of the compiled pattern.
|
||||
PCRE_ERROR_BADCOUNT (-15)
|
||||
.sp
|
||||
This error is given if the value of the \fIovecsize\fP argument is negative.
|
||||
.sp
|
||||
PCRE_ERROR_RECURSIONLIMIT (-21)
|
||||
.sp
|
||||
The internal recursion limit, as specified by the \fImatch_limit_recursion\fP
|
||||
field in a \fBpcre_extra\fP structure (or defaulted) was reached. See the
|
||||
description above.
|
||||
.sp
|
||||
PCRE_ERROR_BADNEWLINE (-23)
|
||||
.sp
|
||||
An invalid combination of PCRE_NEWLINE_\fIxxx\fP options was given.
|
||||
.P
|
||||
Error numbers -16 to -20 and -22 are not used by \fBpcre_exec()\fP.
|
||||
.
|
||||
.
|
||||
.SH "EXTRACTING CAPTURED SUBSTRINGS BY NUMBER"
|
||||
@@ -1422,14 +1606,12 @@ This error is given if the value of the \fIovecsize\fP argument is negative.
|
||||
.ti +5n
|
||||
.B int \fIbuffersize\fP);
|
||||
.PP
|
||||
.br
|
||||
.B int pcre_get_substring(const char *\fIsubject\fP, int *\fIovector\fP,
|
||||
.ti +5n
|
||||
.B int \fIstringcount\fP, int \fIstringnumber\fP,
|
||||
.ti +5n
|
||||
.B const char **\fIstringptr\fP);
|
||||
.PP
|
||||
.br
|
||||
.B int pcre_get_substring_list(const char *\fIsubject\fP,
|
||||
.ti +5n
|
||||
.B int *\fIovector\fP, int \fIstringcount\fP, "const char ***\fIlistptr\fP);"
|
||||
@@ -1468,7 +1650,7 @@ the string is placed in \fIbuffer\fP, whose length is given by
|
||||
\fIbuffersize\fP, while for \fBpcre_get_substring()\fP a new block of memory is
|
||||
obtained via \fBpcre_malloc\fP, and its address is returned via
|
||||
\fIstringptr\fP. The yield of the function is the length of the string, not
|
||||
including the terminating zero, or one of
|
||||
including the terminating zero, or one of these error codes:
|
||||
.sp
|
||||
PCRE_ERROR_NOMEMORY (-6)
|
||||
.sp
|
||||
@@ -1484,7 +1666,7 @@ and builds a list of pointers to them. All this is done in a single block of
|
||||
memory that is obtained via \fBpcre_malloc\fP. The address of the memory block
|
||||
is returned via \fIlistptr\fP, which is also the start of the list of string
|
||||
pointers. The end of the list is marked by a NULL pointer. The yield of the
|
||||
function is zero if all went well, or
|
||||
function is zero if all went well, or the error code
|
||||
.sp
|
||||
PCRE_ERROR_NOMEMORY (-6)
|
||||
.sp
|
||||
@@ -1515,7 +1697,6 @@ provided.
|
||||
.ti +5n
|
||||
.B const char *\fIname\fP);
|
||||
.PP
|
||||
.br
|
||||
.B int pcre_copy_named_substring(const pcre *\fIcode\fP,
|
||||
.ti +5n
|
||||
.B const char *\fIsubject\fP, int *\fIovector\fP,
|
||||
@@ -1524,7 +1705,6 @@ provided.
|
||||
.ti +5n
|
||||
.B char *\fIbuffer\fP, int \fIbuffersize\fP);
|
||||
.PP
|
||||
.br
|
||||
.B int pcre_get_named_substring(const pcre *\fIcode\fP,
|
||||
.ti +5n
|
||||
.B const char *\fIsubject\fP, int *\fIovector\fP,
|
||||
@@ -1536,7 +1716,7 @@ provided.
|
||||
To extract a substring by name, you first have to find the associated number.
|
||||
For example, for this pattern
|
||||
.sp
|
||||
(a+)b(?P<xxx>\ed+)...
|
||||
(a+)b(?<xxx>\ed+)...
|
||||
.sp
|
||||
the number of the subpattern called "xxx" is 2. If the name is known to be
|
||||
unique (PCRE_DUPNAMES was not set), you can find the number from the name by
|
||||
@@ -1560,9 +1740,14 @@ pattern. This is needed in order to gain access to the name-to-number
|
||||
translation table.
|
||||
.P
|
||||
These functions call \fBpcre_get_stringnumber()\fP, and if it succeeds, they
|
||||
then call \fIpcre_copy_substring()\fP or \fIpcre_get_substring()\fP, as
|
||||
appropriate.
|
||||
.
|
||||
then call \fBpcre_copy_substring()\fP or \fBpcre_get_substring()\fP, as
|
||||
appropriate. \fBNOTE:\fP If PCRE_DUPNAMES is set and there are duplicate names,
|
||||
the behaviour may not be what you want (see the next section).
|
||||
.P
|
||||
\fBWarning:\fP If the pattern uses the "(?|" feature to set up multiple
|
||||
subpatterns with the same number, you cannot use names to distinguish them,
|
||||
because names are not included in the compiled code. The matching process uses
|
||||
only numbers.
|
||||
.
|
||||
.SH "DUPLICATE SUBPATTERN NAMES"
|
||||
.rs
|
||||
@@ -1578,22 +1763,25 @@ example is shown in the
|
||||
.\" HREF
|
||||
\fBpcrepattern\fP
|
||||
.\"
|
||||
documentation. When duplicates are present, \fBpcre_copy_named_substring()\fP
|
||||
and \fBpcre_get_named_substring()\fP return the first substring corresponding
|
||||
to the given name that is set. If none are set, an empty string is returned.
|
||||
The \fBpcre_get_stringnumber()\fP function returns one of the numbers that are
|
||||
associated with the name, but it is not defined which it is.
|
||||
.sp
|
||||
documentation.
|
||||
.P
|
||||
When duplicates are present, \fBpcre_copy_named_substring()\fP and
|
||||
\fBpcre_get_named_substring()\fP return the first substring corresponding to
|
||||
the given name that is set. If none are set, PCRE_ERROR_NOSUBSTRING (-7) is
|
||||
returned; no data is returned. The \fBpcre_get_stringnumber()\fP function
|
||||
returns one of the numbers that are associated with the name, but it is not
|
||||
defined which it is.
|
||||
.P
|
||||
If you want to get full details of all captured substrings for a given name,
|
||||
you must use the \fBpcre_get_stringtable_entries()\fP function. The first
|
||||
argument is the compiled pattern, and the second is the name. The third and
|
||||
fourth are pointers to variables which are updated by the function. After it
|
||||
has run, they point to the first and last entries in the name-to-number table
|
||||
for the given name. The function itself returns the length of each entry, or
|
||||
PCRE_ERROR_NOSUBSTRING if there are none. The format of the table is described
|
||||
above in the section entitled \fIInformation about a pattern\fP. Given all the
|
||||
relevant entries for the name, you can extract each of their numbers, and hence
|
||||
the captured data, if any.
|
||||
PCRE_ERROR_NOSUBSTRING (-7) if there are none. The format of the table is
|
||||
described above in the section entitled \fIInformation about a pattern\fP.
|
||||
Given all the relevant entries for the name, you can extract each of their
|
||||
numbers, and hence the captured data, if any.
|
||||
.
|
||||
.
|
||||
.SH "FINDING ALL POSSIBLE MATCHES"
|
||||
@@ -1631,11 +1819,12 @@ will yield PCRE_ERROR_NOMATCH.
|
||||
.B int *\fIworkspace\fP, int \fIwscount\fP);
|
||||
.P
|
||||
The function \fBpcre_dfa_exec()\fP is called to match a subject string against
|
||||
a compiled pattern, using a "DFA" matching algorithm. This has different
|
||||
characteristics to the normal algorithm, and is not compatible with Perl. Some
|
||||
of the features of PCRE patterns are not supported. Nevertheless, there are
|
||||
times when this kind of matching can be useful. For a discussion of the two
|
||||
matching algorithms, see the
|
||||
a compiled pattern, using a matching algorithm that scans the subject string
|
||||
just once, and does not backtrack. This has different characteristics to the
|
||||
normal algorithm, and is not compatible with Perl. Some of the features of PCRE
|
||||
patterns are not supported. Nevertheless, there are times when this kind of
|
||||
matching can be useful. For a discussion of the two matching algorithms, see
|
||||
the
|
||||
.\" HREF
|
||||
\fBpcrematching\fP
|
||||
.\"
|
||||
@@ -1691,9 +1880,9 @@ matching string.
|
||||
PCRE_DFA_SHORTEST
|
||||
.sp
|
||||
Setting the PCRE_DFA_SHORTEST option causes the matching algorithm to stop as
|
||||
soon as it has found one match. Because of the way the DFA algorithm works,
|
||||
this is necessarily the shortest possible match at the first possible matching
|
||||
point in the subject string.
|
||||
soon as it has found one match. Because of the way the alternative algorithm
|
||||
works, this is necessarily the shortest possible match at the first possible
|
||||
matching point in the subject string.
|
||||
.sp
|
||||
PCRE_DFA_RESTART
|
||||
.sp
|
||||
@@ -1732,10 +1921,10 @@ the three matched strings are
|
||||
On success, the yield of the function is a number greater than zero, which is
|
||||
the number of matched substrings. The substrings themselves are returned in
|
||||
\fIovector\fP. Each string uses two elements; the first is the offset to the
|
||||
start, and the second is the offset to the end. All the strings have the same
|
||||
start offset. (Space could have been saved by giving this only once, but it was
|
||||
decided to retain some compatibility with the way \fBpcre_exec()\fP returns
|
||||
data, even though the meaning of the strings is different.)
|
||||
start, and the second is the offset to the end. In fact, all the strings have
|
||||
the same start offset. (Space could have been saved by giving this only once,
|
||||
but it was decided to retain some compatibility with the way \fBpcre_exec()\fP
|
||||
returns data, even though the meaning of the strings is different.)
|
||||
.P
|
||||
The strings are returned in reverse order of length; that is, the longest
|
||||
matching string is given first. If there were too many matches to fit into
|
||||
@@ -1762,8 +1951,9 @@ that it does not support, for instance, the use of \eC or a back reference.
|
||||
.sp
|
||||
PCRE_ERROR_DFA_UCOND (-17)
|
||||
.sp
|
||||
This return is given if \fBpcre_dfa_exec()\fP encounters a condition item in a
|
||||
pattern that uses a back reference for the condition. This is not supported.
|
||||
This return is given if \fBpcre_dfa_exec()\fP encounters a condition item that
|
||||
uses a back reference for the condition, or a test for recursion in a specific
|
||||
group. These are not supported.
|
||||
.sp
|
||||
PCRE_ERROR_DFA_UMLIMIT (-18)
|
||||
.sp
|
||||
@@ -1782,8 +1972,30 @@ When a recursive subpattern is processed, the matching function calls itself
|
||||
recursively, using private vectors for \fIovector\fP and \fIworkspace\fP. This
|
||||
error is given if the output vector is not large enough. This should be
|
||||
extremely rare, as a vector of size 1000 is used.
|
||||
.P
|
||||
.in 0
|
||||
Last updated: 08 June 2006
|
||||
.br
|
||||
Copyright (c) 1997-2006 University of Cambridge.
|
||||
.
|
||||
.
|
||||
.SH "SEE ALSO"
|
||||
.rs
|
||||
.sp
|
||||
\fBpcrebuild\fP(3), \fBpcrecallout\fP(3), \fBpcrecpp\fP(3),
|
||||
\fBpcrematching\fP(3), \fBpcrepartial\fP(3), \fBpcreposix\fP(3),
|
||||
\fBpcreprecompile\fP(3), \fBpcresample\fP(3), \fBpcrestack\fP(3).
|
||||
.
|
||||
.
|
||||
.SH AUTHOR
|
||||
.rs
|
||||
.sp
|
||||
.nf
|
||||
Philip Hazel
|
||||
University Computing Service
|
||||
Cambridge CB2 3QH, England.
|
||||
.fi
|
||||
.
|
||||
.
|
||||
.SH REVISION
|
||||
.rs
|
||||
.sp
|
||||
.nf
|
||||
Last updated: 11 April 2009
|
||||
Copyright (c) 1997-2009 University of Cambridge.
|
||||
.fi
|
||||
|
||||
+155
-34
@@ -5,16 +5,21 @@ PCRE - Perl-compatible regular expressions
|
||||
.rs
|
||||
.sp
|
||||
This document describes the optional features of PCRE that can be selected when
|
||||
the library is compiled. They are all selected, or deselected, by providing
|
||||
options to the \fBconfigure\fP script that is run before the \fBmake\fP
|
||||
command. The complete list of options for \fBconfigure\fP (which includes the
|
||||
standard ones such as the selection of the installation directory) can be
|
||||
obtained by running
|
||||
the library is compiled. It assumes use of the \fBconfigure\fP script, where
|
||||
the optional features are selected or deselected by providing options to
|
||||
\fBconfigure\fP before running the \fBmake\fP command. However, the same
|
||||
options can be selected in both Unix-like and non-Unix-like environments using
|
||||
the GUI facility of \fBCMakeSetup\fP if you are using \fBCMake\fP instead of
|
||||
\fBconfigure\fP to build PCRE.
|
||||
.P
|
||||
The complete list of options for \fBconfigure\fP (which includes the standard
|
||||
ones such as the selection of the installation directory) can be obtained by
|
||||
running
|
||||
.sp
|
||||
./configure --help
|
||||
.sp
|
||||
The following sections describe certain options whose names begin with --enable
|
||||
or --disable. These settings specify changes to the defaults for the
|
||||
The following sections include descriptions of options whose names begin with
|
||||
--enable or --disable. These settings specify changes to the defaults for the
|
||||
\fBconfigure\fP command. Because of the way that \fBconfigure\fP works,
|
||||
--enable and --disable always come in pairs, so the complementary option always
|
||||
exists as well, but as it specifies the default, it is not described.
|
||||
@@ -33,7 +38,7 @@ to the \fBconfigure\fP command.
|
||||
.SH "UTF-8 SUPPORT"
|
||||
.rs
|
||||
.sp
|
||||
To build PCRE with support for UTF-8 character strings, add
|
||||
To build PCRE with support for UTF-8 Unicode character strings, add
|
||||
.sp
|
||||
--enable-utf8
|
||||
.sp
|
||||
@@ -41,6 +46,12 @@ to the \fBconfigure\fP command. Of itself, this does not make PCRE treat
|
||||
strings as UTF-8. As well as compiling PCRE with this option, you also have
|
||||
to set the PCRE_UTF8 option when you call the \fBpcre_compile()\fP
|
||||
function.
|
||||
.P
|
||||
If you set --enable-utf8 when compiling in an EBCDIC environment, PCRE expects
|
||||
its input to be either ASCII or UTF-8 (depending on the runtime option). It is
|
||||
not possible to support both EBCDIC and UTF-8 codes in the same version of the
|
||||
library. Consequently, --enable-utf8 and --enable-ebcdic are mutually
|
||||
exclusive.
|
||||
.
|
||||
.SH "UNICODE CHARACTER PROPERTY SUPPORT"
|
||||
.rs
|
||||
@@ -56,9 +67,9 @@ character properties, you must add
|
||||
to the \fBconfigure\fP command. This implies UTF-8 support, even if you have
|
||||
not explicitly requested it.
|
||||
.P
|
||||
Including Unicode property support adds around 90K of tables to the PCRE
|
||||
library, approximately doubling its size. Only the general category properties
|
||||
such as \fILu\fP and \fINd\fP are supported. Details are given in the
|
||||
Including Unicode property support adds around 30K of tables to the PCRE
|
||||
library. Only the general category properties such as \fILu\fP and \fINd\fP are
|
||||
supported. Details are given in the
|
||||
.\" HREF
|
||||
\fBpcrepattern\fP
|
||||
.\"
|
||||
@@ -67,9 +78,9 @@ documentation.
|
||||
.SH "CODE VALUE OF NEWLINE"
|
||||
.rs
|
||||
.sp
|
||||
By default, PCRE interprets character 10 (linefeed, LF) as indicating the end
|
||||
By default, PCRE interprets the linefeed (LF) character as indicating the end
|
||||
of a line. This is the normal newline character on Unix-like systems. You can
|
||||
compile PCRE to use character 13 (carriage return, CR) instead, by adding
|
||||
compile PCRE to use carriage return (CR) instead, by adding
|
||||
.sp
|
||||
--enable-newline-is-cr
|
||||
.sp
|
||||
@@ -81,9 +92,32 @@ character sequence CRLF. If you want this, add
|
||||
.sp
|
||||
--enable-newline-is-crlf
|
||||
.sp
|
||||
to the \fBconfigure\fP command. Whatever line ending convention is selected
|
||||
when PCRE is built can be overridden when the library functions are called. At
|
||||
build time it is conventional to use the standard for your operating system.
|
||||
to the \fBconfigure\fP command. There is a fourth option, specified by
|
||||
.sp
|
||||
--enable-newline-is-anycrlf
|
||||
.sp
|
||||
which causes PCRE to recognize any of the three sequences CR, LF, or CRLF as
|
||||
indicating a line ending. Finally, a fifth option, specified by
|
||||
.sp
|
||||
--enable-newline-is-any
|
||||
.sp
|
||||
causes PCRE to recognize any Unicode newline sequence.
|
||||
.P
|
||||
Whatever line ending convention is selected when PCRE is built can be
|
||||
overridden when the library functions are called. At build time it is
|
||||
conventional to use the standard for your operating system.
|
||||
.
|
||||
.SH "WHAT \eR MATCHES"
|
||||
.rs
|
||||
.sp
|
||||
By default, the sequence \eR in a pattern matches any Unicode newline sequence,
|
||||
whatever has been selected as the line ending sequence. If you specify
|
||||
.sp
|
||||
--enable-bsr-anycrlf
|
||||
.sp
|
||||
the default is changed so that \eR matches only CR, LF, or CRLF. Whatever is
|
||||
selected when PCRE is built can be overridden when the library functions are
|
||||
called.
|
||||
.
|
||||
.SH "BUILDING SHARED AND STATIC LIBRARIES"
|
||||
.rs
|
||||
@@ -131,10 +165,6 @@ or four-byte offsets by adding a setting such as
|
||||
to the \fBconfigure\fP command. The value given must be 2, 3, or 4. Using
|
||||
longer offsets slows down the operation of PCRE because it has to load
|
||||
additional bytes when handling them.
|
||||
.P
|
||||
If you build PCRE with an increased link size, test 2 (and test 5 if you are
|
||||
using UTF-8) will fail. Part of the output of these tests is a representation
|
||||
of the compiled pattern, and this changes with the link size.
|
||||
.
|
||||
.SH "AVOIDING EXCESSIVE STACK USAGE"
|
||||
.rs
|
||||
@@ -157,13 +187,17 @@ build a version of PCRE that works this way, add
|
||||
.sp
|
||||
to the \fBconfigure\fP command. With this configuration, PCRE will use the
|
||||
\fBpcre_stack_malloc\fP and \fBpcre_stack_free\fP variables to call memory
|
||||
management functions. Separate functions are provided because the usage is very
|
||||
predictable: the block sizes requested are always the same, and the blocks are
|
||||
always freed in reverse order. A calling program might be able to implement
|
||||
optimized functions that perform better than the standard \fBmalloc()\fP and
|
||||
\fBfree()\fP functions. PCRE runs noticeably more slowly when built in this
|
||||
way. This option affects only the \fBpcre_exec()\fP function; it is not
|
||||
relevant for the \fBpcre_dfa_exec()\fP function.
|
||||
management functions. By default these point to \fBmalloc()\fP and
|
||||
\fBfree()\fP, but you can replace the pointers so that your own functions are
|
||||
used.
|
||||
.P
|
||||
Separate functions are provided rather than using \fBpcre_malloc\fP and
|
||||
\fBpcre_free\fP because the usage is very predictable: the block sizes
|
||||
requested are always the same, and the blocks are always freed in reverse
|
||||
order. A calling program might be able to implement optimized functions that
|
||||
perform better than \fBmalloc()\fP and \fBfree()\fP. PCRE runs noticeably more
|
||||
slowly when built in this way. This option affects only the \fBpcre_exec()\fP
|
||||
function; it is not relevant for the \fBpcre_dfa_exec()\fP function.
|
||||
.
|
||||
.SH "LIMITING PCRE RESOURCE USAGE"
|
||||
.rs
|
||||
@@ -196,18 +230,105 @@ constraints. However, you can set a lower limit by adding, for example,
|
||||
.sp
|
||||
to the \fBconfigure\fP command. This value can also be overridden at run time.
|
||||
.
|
||||
.SH "CREATING CHARACTER TABLES AT BUILD TIME"
|
||||
.rs
|
||||
.sp
|
||||
PCRE uses fixed tables for processing characters whose code values are less
|
||||
than 256. By default, PCRE is built with a set of tables that are distributed
|
||||
in the file \fIpcre_chartables.c.dist\fP. These tables are for ASCII codes
|
||||
only. If you add
|
||||
.sp
|
||||
--enable-rebuild-chartables
|
||||
.sp
|
||||
to the \fBconfigure\fP command, the distributed tables are no longer used.
|
||||
Instead, a program called \fBdftables\fP is compiled and run. This outputs the
|
||||
source for a new set of tables, created in the default locale of your C runtime
|
||||
system. (This method of replacing the tables does not work if you are cross
|
||||
compiling, because \fBdftables\fP is run on the local host. If you need to
|
||||
create alternative tables when cross compiling, you will have to do so "by
|
||||
hand".)
|
||||
.
|
||||
.SH "USING EBCDIC CODE"
|
||||
.rs
|
||||
.sp
|
||||
PCRE assumes by default that it will run in an environment where the character
|
||||
code is ASCII (or Unicode, which is a superset of ASCII). PCRE can, however, be
|
||||
compiled to run in an EBCDIC environment by adding
|
||||
code is ASCII (or Unicode, which is a superset of ASCII). This is the case for
|
||||
most computer operating systems. PCRE can, however, be compiled to run in an
|
||||
EBCDIC environment by adding
|
||||
.sp
|
||||
--enable-ebcdic
|
||||
.sp
|
||||
to the \fBconfigure\fP command.
|
||||
to the \fBconfigure\fP command. This setting implies
|
||||
--enable-rebuild-chartables. You should only use it if you know that you are in
|
||||
an EBCDIC environment (for example, an IBM mainframe operating system). The
|
||||
--enable-ebcdic option is incompatible with --enable-utf8.
|
||||
.
|
||||
.SH "PCREGREP OPTIONS FOR COMPRESSED FILE SUPPORT"
|
||||
.rs
|
||||
.sp
|
||||
By default, \fBpcregrep\fP reads all files as plain text. You can build it so
|
||||
that it recognizes files whose names end in \fB.gz\fP or \fB.bz2\fP, and reads
|
||||
them with \fBlibz\fP or \fBlibbz2\fP, respectively, by adding one or both of
|
||||
.sp
|
||||
--enable-pcregrep-libz
|
||||
--enable-pcregrep-libbz2
|
||||
.sp
|
||||
to the \fBconfigure\fP command. These options naturally require that the
|
||||
relevant libraries are installed on your system. Configuration will fail if
|
||||
they are not.
|
||||
.
|
||||
.SH "PCRETEST OPTION FOR LIBREADLINE SUPPORT"
|
||||
.rs
|
||||
.sp
|
||||
If you add
|
||||
.sp
|
||||
--enable-pcretest-libreadline
|
||||
.sp
|
||||
to the \fBconfigure\fP command, \fBpcretest\fP is linked with the
|
||||
\fBlibreadline\fP library, and when its input is from a terminal, it reads it
|
||||
using the \fBreadline()\fP function. This provides line-editing and history
|
||||
facilities. Note that \fBlibreadline\fP is GPL-licenced, so if you distribute a
|
||||
binary of \fBpcretest\fP linked in this way, there may be licensing issues.
|
||||
.P
|
||||
.in 0
|
||||
Last updated: 06 June 2006
|
||||
.br
|
||||
Copyright (c) 1997-2006 University of Cambridge.
|
||||
Setting this option causes the \fB-lreadline\fP option to be added to the
|
||||
\fBpcretest\fP build. In many operating environments with a system-installed
|
||||
\fBlibreadline\fP this is sufficient. However, in some environments (e.g.
|
||||
if an unmodified distribution version of readline is in use), some extra
|
||||
configuration may be necessary. The INSTALL file for \fBlibreadline\fP says
|
||||
this:
|
||||
.sp
|
||||
"Readline uses the termcap functions, but does not link with the
|
||||
termcap or curses library itself, allowing applications which link
|
||||
with readline the to choose an appropriate library."
|
||||
.sp
|
||||
If your environment has not been set up so that an appropriate library is
|
||||
automatically included, you may need to add something like
|
||||
.sp
|
||||
LIBS="-lncurses"
|
||||
.sp
|
||||
immediately before the \fBconfigure\fP command.
|
||||
.
|
||||
.
|
||||
.SH "SEE ALSO"
|
||||
.rs
|
||||
.sp
|
||||
\fBpcreapi\fP(3), \fBpcre_config\fP(3).
|
||||
.
|
||||
.
|
||||
.SH AUTHOR
|
||||
.rs
|
||||
.sp
|
||||
.nf
|
||||
Philip Hazel
|
||||
University Computing Service
|
||||
Cambridge CB2 3QH, England.
|
||||
.fi
|
||||
.
|
||||
.
|
||||
.SH REVISION
|
||||
.rs
|
||||
.sp
|
||||
.nf
|
||||
Last updated: 17 March 2009
|
||||
Copyright (c) 1997-2009 University of Cambridge.
|
||||
.fi
|
||||
|
||||
+33
-11
@@ -17,7 +17,7 @@ function is to be called. Different callout points can be identified by putting
|
||||
a number less than 256 after the letter C. The default value is zero.
|
||||
For example, this pattern has two callout points:
|
||||
.sp
|
||||
(?C1)\deabc(?C2)def
|
||||
(?C1)abc(?C2)def
|
||||
.sp
|
||||
If the PCRE_AUTO_CALLOUT option bit is set when \fBpcre_compile()\fP is called,
|
||||
PCRE automatically inserts callouts, all with number 255, before each item in
|
||||
@@ -44,7 +44,8 @@ trying to optimize the performance of a particular pattern.
|
||||
.rs
|
||||
.sp
|
||||
You should be aware that, because of optimizations in the way PCRE matches
|
||||
patterns, callouts sometimes do not happen. For example, if the pattern is
|
||||
patterns by default, callouts sometimes do not happen. For example, if the
|
||||
pattern is
|
||||
.sp
|
||||
ab(?C4)cd
|
||||
.sp
|
||||
@@ -52,6 +53,11 @@ PCRE knows that any matching string must contain the letter "d". If the subject
|
||||
string is "abyz", the lack of "d" means that matching doesn't ever start, and
|
||||
the callout is never reached. However, with "abyd", though the result is still
|
||||
no match, the callout is obeyed.
|
||||
.P
|
||||
You can disable these optimizations by passing the PCRE_NO_START_OPTIMIZE
|
||||
option to \fBpcre_exec()\fP or \fBpcre_dfa_exec()\fP. This slows down the
|
||||
matching process, but does ensure that callouts such as the example above are
|
||||
obeyed.
|
||||
.
|
||||
.
|
||||
.SH "THE CALLOUT INTERFACE"
|
||||
@@ -95,10 +101,12 @@ not useful.
|
||||
The \fIsubject\fP and \fIsubject_length\fP fields contain copies of the values
|
||||
that were passed to \fBpcre_exec()\fP.
|
||||
.P
|
||||
The \fIstart_match\fP field contains the offset within the subject at which the
|
||||
current match attempt started. If the pattern is not anchored, the callout
|
||||
function may be called several times from the same point in the pattern for
|
||||
different starting points in the subject.
|
||||
The \fIstart_match\fP field normally contains the offset within the subject at
|
||||
which the current match attempt started. However, if the escape sequence \eK
|
||||
has been encountered, this value is changed to reflect the modified starting
|
||||
point. If the pattern is not anchored, the callout function may be called
|
||||
several times from the same point in the pattern for different starting points
|
||||
in the subject.
|
||||
.P
|
||||
The \fIcurrent_position\fP field contains the offset within the subject of the
|
||||
current match pointer.
|
||||
@@ -154,8 +162,22 @@ Negative values should normally be chosen from the set of PCRE_ERROR_xxx
|
||||
values. In particular, PCRE_ERROR_NOMATCH forces a standard "no match" failure.
|
||||
The error number PCRE_ERROR_CALLOUT is reserved for use by callout functions;
|
||||
it will never be used by PCRE itself.
|
||||
.P
|
||||
.in 0
|
||||
Last updated: 28 February 2005
|
||||
.br
|
||||
Copyright (c) 1997-2005 University of Cambridge.
|
||||
.
|
||||
.
|
||||
.SH AUTHOR
|
||||
.rs
|
||||
.sp
|
||||
.nf
|
||||
Philip Hazel
|
||||
University Computing Service
|
||||
Cambridge CB2 3QH, England.
|
||||
.fi
|
||||
.
|
||||
.
|
||||
.SH REVISION
|
||||
.rs
|
||||
.sp
|
||||
.nf
|
||||
Last updated: 15 March 2009
|
||||
Copyright (c) 1997-2009 University of Cambridge.
|
||||
.fi
|
||||
|
||||
+51
-29
@@ -5,8 +5,9 @@ PCRE - Perl-compatible regular expressions
|
||||
.rs
|
||||
.sp
|
||||
This document describes the differences in the ways that PCRE and Perl handle
|
||||
regular expressions. The differences described here are with respect to Perl
|
||||
5.8.
|
||||
regular expressions. The differences described here are mainly with respect to
|
||||
Perl 5.8, though PCRE versions 7.0 and later contain some features that are
|
||||
expected to be in the forthcoming Perl 5.10.
|
||||
.P
|
||||
1. PCRE has only a subset of Perl's UTF-8 and Unicode support. Details of what
|
||||
it does have are given in the
|
||||
@@ -63,20 +64,32 @@ following examples:
|
||||
.sp
|
||||
The \eQ...\eE sequence is recognized both inside and outside character classes.
|
||||
.P
|
||||
8. Fairly obviously, PCRE does not support the (?{code}) and (?p{code})
|
||||
constructions. However, there is support for recursive patterns using the
|
||||
non-Perl items (?R), (?number), and (?P>name). Also, the PCRE "callout" feature
|
||||
allows an external function to be called during pattern matching. See the
|
||||
8. Fairly obviously, PCRE does not support the (?{code}) and (??{code})
|
||||
constructions. However, there is support for recursive patterns. This is not
|
||||
available in Perl 5.8, but will be in Perl 5.10. Also, the PCRE "callout"
|
||||
feature allows an external function to be called during pattern matching. See
|
||||
the
|
||||
.\" HREF
|
||||
\fBpcrecallout\fP
|
||||
.\"
|
||||
documentation for details.
|
||||
.P
|
||||
9. There are some differences that are concerned with the settings of captured
|
||||
9. Subpatterns that are called recursively or as "subroutines" are always
|
||||
treated as atomic groups in PCRE. This is like Python, but unlike Perl.
|
||||
.P
|
||||
10. There are some differences that are concerned with the settings of captured
|
||||
strings when part of a pattern is repeated. For example, matching "aba" against
|
||||
the pattern /^(a(b)?)+$/ in Perl leaves $2 unset, but in PCRE it is set to "b".
|
||||
.P
|
||||
10. PCRE provides some extensions to the Perl regular expression facilities:
|
||||
11. PCRE does support Perl 5.10's backtracking verbs (*ACCEPT), (*FAIL), (*F),
|
||||
(*COMMIT), (*PRUNE), (*SKIP), and (*THEN), but only in the forms without an
|
||||
argument. PCRE does not support (*MARK). If (*ACCEPT) is within capturing
|
||||
parentheses, PCRE does not set that capture group; this is different to Perl.
|
||||
.P
|
||||
12. PCRE provides some extensions to the Perl regular expression facilities.
|
||||
Perl 5.10 will include new features that are not in earlier versions, some of
|
||||
which (such as named parentheses) have been in PCRE for some time. This list is
|
||||
with respect to Perl 5.10:
|
||||
.sp
|
||||
(a) Although lookbehind assertions must match fixed length strings, each
|
||||
alternative branch of a lookbehind assertion can match a different length of
|
||||
@@ -86,8 +99,8 @@ string. Perl requires them all to have the same length.
|
||||
meta-character matches only at the very end of the string.
|
||||
.sp
|
||||
(c) If PCRE_EXTRA is set, a backslash followed by a letter with no special
|
||||
meaning is faulted. Otherwise, like Perl, the backslash is ignored. (Perl can
|
||||
be made to issue a warning.)
|
||||
meaning is faulted. Otherwise, like Perl, the backslash is quietly ignored.
|
||||
(Perl can be made to issue a warning.)
|
||||
.sp
|
||||
(d) If PCRE_UNGREEDY is set, the greediness of the repetition quantifiers is
|
||||
inverted, that is, by default they are not greedy, but if followed by a
|
||||
@@ -99,28 +112,37 @@ only at the first matching position in the subject string.
|
||||
(f) The PCRE_NOTBOL, PCRE_NOTEOL, PCRE_NOTEMPTY, and PCRE_NO_AUTO_CAPTURE
|
||||
options for \fBpcre_exec()\fP have no Perl equivalents.
|
||||
.sp
|
||||
(g) The (?R), (?number), and (?P>name) constructs allow for recursive pattern
|
||||
matching (Perl can do this using the (?p{code}) construct, which PCRE cannot
|
||||
support.)
|
||||
(g) The \eR escape sequence can be restricted to match only CR, LF, or CRLF
|
||||
by the PCRE_BSR_ANYCRLF option.
|
||||
.sp
|
||||
(h) PCRE supports named capturing substrings, using the Python syntax.
|
||||
(h) The callout facility is PCRE-specific.
|
||||
.sp
|
||||
(i) PCRE supports the possessive quantifier "++" syntax, taken from Sun's Java
|
||||
package.
|
||||
(i) The partial matching facility is PCRE-specific.
|
||||
.sp
|
||||
(j) The (R) condition, for testing recursion, is a PCRE extension.
|
||||
.sp
|
||||
(k) The callout facility is PCRE-specific.
|
||||
.sp
|
||||
(l) The partial matching facility is PCRE-specific.
|
||||
.sp
|
||||
(m) Patterns compiled by PCRE can be saved and re-used at a later time, even on
|
||||
(j) Patterns compiled by PCRE can be saved and re-used at a later time, even on
|
||||
different hosts that have the other endianness.
|
||||
.sp
|
||||
(n) The alternative matching function (\fBpcre_dfa_exec()\fP) matches in a
|
||||
(k) The alternative matching function (\fBpcre_dfa_exec()\fP) matches in a
|
||||
different way and is not Perl-compatible.
|
||||
.P
|
||||
.in 0
|
||||
Last updated: 06 June 2006
|
||||
.br
|
||||
Copyright (c) 1997-2006 University of Cambridge.
|
||||
.sp
|
||||
(l) PCRE recognizes some special sequences such as (*CR) at the start of
|
||||
a pattern that set overall options that cannot be changed within the pattern.
|
||||
.
|
||||
.
|
||||
.SH AUTHOR
|
||||
.rs
|
||||
.sp
|
||||
.nf
|
||||
Philip Hazel
|
||||
University Computing Service
|
||||
Cambridge CB2 3QH, England.
|
||||
.fi
|
||||
.
|
||||
.
|
||||
.SH REVISION
|
||||
.rs
|
||||
.sp
|
||||
.nf
|
||||
Last updated: 11 September 2007
|
||||
Copyright (c) 1997-2007 University of Cambridge.
|
||||
.fi
|
||||
|
||||
+41
-6
@@ -5,9 +5,7 @@ PCRE - Perl-compatible regular expressions.
|
||||
.rs
|
||||
.sp
|
||||
.B #include <pcrecpp.h>
|
||||
.PP
|
||||
.SM
|
||||
.br
|
||||
.
|
||||
.SH DESCRIPTION
|
||||
.rs
|
||||
.sp
|
||||
@@ -81,14 +79,42 @@ The function returns true iff all of the following conditions are satisfied:
|
||||
.sp
|
||||
c. The "i"th argument has a suitable type for holding the
|
||||
string captured as the "i"th sub-pattern. If you pass in
|
||||
NULL for the "i"th argument, or pass fewer arguments than
|
||||
void * NULL for the "i"th argument, or a non-void * NULL
|
||||
of the correct type, or pass fewer arguments than the
|
||||
number of sub-patterns, the "i"th captured sub-pattern is
|
||||
ignored.
|
||||
.sp
|
||||
CAVEAT: An optional sub-pattern that does not exist in the matched
|
||||
string is assigned the empty string. Therefore, the following will
|
||||
return false (because the empty string is not a valid number):
|
||||
.sp
|
||||
int number;
|
||||
pcrecpp::RE::FullMatch("abc", "[a-z]+(\e\ed+)?", &number);
|
||||
.sp
|
||||
The matching interface supports at most 16 arguments per call.
|
||||
If you need more, consider using the more general interface
|
||||
\fBpcrecpp::RE::DoMatch\fP. See \fBpcrecpp.h\fP for the signature for
|
||||
\fBDoMatch\fP.
|
||||
.P
|
||||
NOTE: Do not use \fBno_arg\fP, which is used internally to mark the end of a
|
||||
list of optional arguments, as a placeholder for missing arguments, as this can
|
||||
lead to segfaults.
|
||||
.
|
||||
.
|
||||
.SH "QUOTING METACHARACTERS"
|
||||
.rs
|
||||
.sp
|
||||
You can use the "QuoteMeta" operation to insert backslashes before all
|
||||
potentially meaningful characters in a string. The returned string, used as a
|
||||
regular expression, will exactly match the original string.
|
||||
.sp
|
||||
Example:
|
||||
string quoted = RE::QuoteMeta(unquoted);
|
||||
.sp
|
||||
Note that it's legal to escape a character even if it has no special meaning in
|
||||
a regular expression -- so this function does that. (This also makes it
|
||||
identical to the Perl function of the same name; see "perldoc -f quotemeta".)
|
||||
For example, "1.5-2.0?" becomes "1\e.5\e-2\e.0\e?".
|
||||
.
|
||||
.SH "PARTIAL MATCHES"
|
||||
.rs
|
||||
@@ -307,6 +333,15 @@ string is left unaffected.
|
||||
.SH AUTHOR
|
||||
.rs
|
||||
.sp
|
||||
.nf
|
||||
The C++ wrapper was contributed by Google Inc.
|
||||
.br
|
||||
Copyright (c) 2005 Google Inc.
|
||||
Copyright (c) 2007 Google Inc.
|
||||
.fi
|
||||
.
|
||||
.
|
||||
.SH REVISION
|
||||
.rs
|
||||
.sp
|
||||
.nf
|
||||
Last updated: 17 March 2009
|
||||
.fi
|
||||
|
||||
+177
-76
@@ -11,10 +11,10 @@ pcregrep - a grep with Perl-compatible regular expressions.
|
||||
grep commands do, but it uses the PCRE regular expression library to support
|
||||
patterns that are compatible with the regular expressions of Perl 5. See
|
||||
.\" HREF
|
||||
\fBpcrepattern\fP
|
||||
\fBpcrepattern\fP(3)
|
||||
.\"
|
||||
for a full description of syntax and semantics of the regular expressions that
|
||||
PCRE supports.
|
||||
for a full description of syntax and semantics of the regular expressions
|
||||
that PCRE supports.
|
||||
.P
|
||||
Patterns, whether supplied on the command line or in a separate file, are given
|
||||
without delimiters. For example:
|
||||
@@ -23,9 +23,9 @@ without delimiters. For example:
|
||||
.sp
|
||||
If you attempt to use delimiters (for example, by surrounding a pattern with
|
||||
slashes, as is common in Perl scripts), they are interpreted as part of the
|
||||
pattern. Quotes can of course be used on the command line because they are
|
||||
interpreted by the shell, and indeed they are required if a pattern contains
|
||||
white space or shell metacharacters.
|
||||
pattern. Quotes can of course be used to delimit patterns on the command line
|
||||
because they are interpreted by the shell, and indeed they are required if a
|
||||
pattern contains white space or shell metacharacters.
|
||||
.P
|
||||
The first argument that follows any option settings is treated as the single
|
||||
pattern to be matched when neither \fB-e\fP nor \fB-f\fP is present.
|
||||
@@ -39,20 +39,53 @@ For example:
|
||||
.sp
|
||||
pcregrep some-pattern /file1 - /file3
|
||||
.sp
|
||||
By default, each line that matches the pattern is copied to the standard
|
||||
By default, each line that matches a pattern is copied to the standard
|
||||
output, and if there is more than one file, the file name is output at the
|
||||
start of each line. However, there are options that can change how
|
||||
\fBpcregrep\fP behaves. In particular, the \fB-M\fP option makes it possible to
|
||||
search for patterns that span line boundaries. What defines a line boundary is
|
||||
controlled by the \fB-N\fP (\fB--newline\fP) option.
|
||||
start of each line, followed by a colon. However, there are options that can
|
||||
change how \fBpcregrep\fP behaves. In particular, the \fB-M\fP option makes it
|
||||
possible to search for patterns that span line boundaries. What defines a line
|
||||
boundary is controlled by the \fB-N\fP (\fB--newline\fP) option.
|
||||
.P
|
||||
Patterns are limited to 8K or BUFSIZ characters, whichever is the greater.
|
||||
BUFSIZ is defined in \fB<stdio.h>\fP.
|
||||
BUFSIZ is defined in \fB<stdio.h>\fP. When there is more than one pattern
|
||||
(specified by the use of \fB-e\fP and/or \fB-f\fP), each pattern is applied to
|
||||
each line in the order in which they are defined, except that all the \fB-e\fP
|
||||
patterns are tried before the \fB-f\fP patterns.
|
||||
.P
|
||||
By default, as soon as one pattern matches (or fails to match when \fB-v\fP is
|
||||
used), no further patterns are considered. However, if \fB--colour\fP (or
|
||||
\fB--color\fP) is used to colour the matching substrings, or if
|
||||
\fB--only-matching\fP, \fB--file-offsets\fP, or \fB--line-offsets\fP is used to
|
||||
output only the part of the line that matched (either shown literally, or as an
|
||||
offset), scanning resumes immediately following the match, so that further
|
||||
matches on the same line can be found. If there are multiple patterns, they are
|
||||
all tried on the remainder of the line, but patterns that follow the one that
|
||||
matched are not tried on the earlier part of the line.
|
||||
.P
|
||||
This is the same behaviour as GNU grep, but it does mean that the order in
|
||||
which multiple patterns are specified can affect the output when one of the
|
||||
above options is used.
|
||||
.P
|
||||
Patterns that can match an empty string are accepted, but empty string
|
||||
matches are not recognized. An example is the pattern "(super)?(man)?", in
|
||||
which all components are optional. This pattern finds all occurrences of both
|
||||
"super" and "man"; the output differs from matching with "super|man" when only
|
||||
the matching substrings are being shown.
|
||||
.P
|
||||
If the \fBLC_ALL\fP or \fBLC_CTYPE\fP environment variable is set,
|
||||
\fBpcregrep\fP uses the value to set a locale when calling the PCRE library.
|
||||
The \fB--locale\fP option can be used to override this.
|
||||
.
|
||||
.SH "SUPPORT FOR COMPRESSED FILES"
|
||||
.rs
|
||||
.sp
|
||||
It is possible to compile \fBpcregrep\fP so that it uses \fBlibz\fP or
|
||||
\fBlibbz2\fP to read files whose names end in \fB.gz\fP or \fB.bz2\fP,
|
||||
respectively. You can find out whether your binary has support for one or both
|
||||
of these file types by running it with the \fB--help\fP option. If the
|
||||
appropriate support is not present, files are treated as plain text. The
|
||||
standard input is always so treated.
|
||||
.
|
||||
.SH OPTIONS
|
||||
.rs
|
||||
.TP 10
|
||||
@@ -93,16 +126,20 @@ If data is required, it must be given in the same shell item, separated by an
|
||||
equals sign.
|
||||
.TP
|
||||
\fB--colour=\fP\fIvalue\fP, \fB--color=\fP\fIvalue\fP
|
||||
This option specifies under what circumstances the part of a line that matched
|
||||
a pattern should be coloured in the output. The value may be "never" (the
|
||||
default), "always", or "auto". In the latter case, colouring happens only if
|
||||
the standard output is connected to a terminal. The colour can be specified by
|
||||
setting the environment variable PCREGREP_COLOUR or PCREGREP_COLOR. The value
|
||||
of this variable should be a string of two numbers, separated by a semicolon.
|
||||
They are copied directly into the control string for setting colour on a
|
||||
terminal, so it is your responsibility to ensure that they make sense. If
|
||||
neither of the environment variables is set, the default is "1;31", which gives
|
||||
red.
|
||||
This option specifies under what circumstances the parts of a line that matched
|
||||
a pattern should be coloured in the output. By default, the output is not
|
||||
coloured. The value (which is optional, see above) may be "never", "always", or
|
||||
"auto". In the latter case, colouring happens only if the standard output is
|
||||
connected to a terminal. More resources are used when colouring is enabled,
|
||||
because \fBpcregrep\fP has to search for all possible matches in a line, not
|
||||
just one, in order to colour them all.
|
||||
|
||||
The colour that is used can be specified by setting the environment variable
|
||||
PCREGREP_COLOUR or PCREGREP_COLOR. The value of this variable should be a
|
||||
string of two numbers, separated by a semicolon. They are copied directly into
|
||||
the control string for setting colour on a terminal, so it is your
|
||||
responsibility to ensure that they make sense. If neither of the environment
|
||||
variables is set, the default is "1;31", which gives red.
|
||||
.TP
|
||||
\fB-D\fP \fIaction\fP, \fB--devices=\fP\fIaction\fP
|
||||
If an input path is not a regular file or a directory, "action" specifies how
|
||||
@@ -116,29 +153,41 @@ option), or "skip" (silently skip the path). In the default case, directories
|
||||
are read as if they were ordinary files. In some operating systems the effect
|
||||
of reading a directory like this is an immediate end-of-file.
|
||||
.TP
|
||||
\fB-e\fP \fIpattern\fP, \fB--regex=\fP\fIpattern\fP,
|
||||
\fB--regexp=\fP\fIpattern\fP Specify a pattern to be matched. This option can
|
||||
be used multiple times in order to specify several patterns. It can also be
|
||||
used as a way of specifying a single pattern that starts with a hyphen. When
|
||||
\fB-e\fP is used, no argument pattern is taken from the command line; all
|
||||
arguments are treated as file names. There is an overall maximum of 100
|
||||
patterns. They are applied to each line in the order in which they are defined
|
||||
until one matches (or fails to match if \fB-v\fP is used). If \fB-f\fP is used
|
||||
with \fB-e\fP, the command line patterns are matched first, followed by the
|
||||
patterns from the file, independent of the order in which these options are
|
||||
specified. Note that multiple use of \fB-e\fP is not the same as a single
|
||||
pattern with alternatives. For example, X|Y finds the first character in a line
|
||||
that is X or Y, whereas if the two patterns are given separately,
|
||||
\fBpcregrep\fP finds X if it is present, even if it follows Y in the line. It
|
||||
finds Y only if there is no X in the line. This really matters only if you are
|
||||
using \fB-o\fP to show the portion of the line that matched.
|
||||
\fB-e\fP \fIpattern\fP, \fB--regex=\fP\fIpattern\fP, \fB--regexp=\fP\fIpattern\fP
|
||||
Specify a pattern to be matched. This option can be used multiple times in
|
||||
order to specify several patterns. It can also be used as a way of specifying a
|
||||
single pattern that starts with a hyphen. When \fB-e\fP is used, no argument
|
||||
pattern is taken from the command line; all arguments are treated as file
|
||||
names. There is an overall maximum of 100 patterns. They are applied to each
|
||||
line in the order in which they are defined until one matches (or fails to
|
||||
match if \fB-v\fP is used). If \fB-f\fP is used with \fB-e\fP, the command line
|
||||
patterns are matched first, followed by the patterns from the file, independent
|
||||
of the order in which these options are specified. Note that multiple use of
|
||||
\fB-e\fP is not the same as a single pattern with alternatives. For example,
|
||||
X|Y finds the first character in a line that is X or Y, whereas if the two
|
||||
patterns are given separately, \fBpcregrep\fP finds X if it is present, even if
|
||||
it follows Y in the line. It finds Y only if there is no X in the line. This
|
||||
really matters only if you are using \fB-o\fP to show the part(s) of the line
|
||||
that matched.
|
||||
.TP
|
||||
\fB--exclude\fP=\fIpattern\fP
|
||||
When \fBpcregrep\fP is searching the files in a directory as a consequence of
|
||||
the \fB-r\fP (recursive search) option, any files whose names match the pattern
|
||||
are excluded. The pattern is a PCRE regular expression. If a file name matches
|
||||
both \fB--include\fP and \fB--exclude\fP, it is excluded. There is no short
|
||||
form for this option.
|
||||
the \fB-r\fP (recursive search) option, any regular files whose names match the
|
||||
pattern are excluded. Subdirectories are not excluded by this option; they are
|
||||
searched recursively, subject to the \fB--exclude_dir\fP and
|
||||
\fB--include_dir\fP options. The pattern is a PCRE regular expression, and is
|
||||
matched against the final component of the file name (not the entire path). If
|
||||
a file name matches both \fB--include\fP and \fB--exclude\fP, it is excluded.
|
||||
There is no short form for this option.
|
||||
.TP
|
||||
\fB--exclude_dir\fP=\fIpattern\fP
|
||||
When \fBpcregrep\fP is searching the contents of a directory as a consequence
|
||||
of the \fB-r\fP (recursive search) option, any subdirectories whose names match
|
||||
the pattern are excluded. (Note that the \fB--exclude\fP option does not affect
|
||||
subdirectories.) The pattern is a PCRE regular expression, and is matched
|
||||
against the final component of the name (not the entire path). If a
|
||||
subdirectory name matches both \fB--include_dir\fP and \fB--exclude_dir\fP, it
|
||||
is excluded. There is no short form for this option.
|
||||
.TP
|
||||
\fB-F\fP, \fB--fixed-strings\fP
|
||||
Interpret each pattern as a list of fixed strings, separated by newlines,
|
||||
@@ -156,34 +205,55 @@ present; they are tested before the file's patterns. However, no other pattern
|
||||
is taken from the command line; all arguments are treated as file names. There
|
||||
is an overall maximum of 100 patterns. Trailing white space is removed from
|
||||
each line, and blank lines are ignored. An empty file contains no patterns and
|
||||
therefore matches nothing.
|
||||
therefore matches nothing. See also the comments about multiple patterns versus
|
||||
a single pattern with alternatives in the description of \fB-e\fP above.
|
||||
.TP
|
||||
\fB--file-offsets\fP
|
||||
Instead of showing lines or parts of lines that match, show each match as an
|
||||
offset from the start of the file and a length, separated by a comma. In this
|
||||
mode, no context is shown. That is, the \fB-A\fP, \fB-B\fP, and \fB-C\fP
|
||||
options are ignored. If there is more than one match in a line, each of them is
|
||||
shown separately. This option is mutually exclusive with \fB--line-offsets\fP
|
||||
and \fB--only-matching\fP.
|
||||
.TP
|
||||
\fB-H\fP, \fB--with-filename\fP
|
||||
Force the inclusion of the filename at the start of output lines when searching
|
||||
a single file. By default, the filename is not shown in this case. For matching
|
||||
lines, the filename is followed by a colon and a space; for context lines, a
|
||||
hyphen separator is used. If a line number is also being output, it follows the
|
||||
file name without a space.
|
||||
lines, the filename is followed by a colon; for context lines, a hyphen
|
||||
separator is used. If a line number is also being output, it follows the file
|
||||
name.
|
||||
.TP
|
||||
\fB-h\fP, \fB--no-filename\fP
|
||||
Suppress the output of filenames when searching multiple files. By default,
|
||||
filenames are shown when multiple files are searched. For matching lines, the
|
||||
filename is followed by a colon and a space; for context lines, a hyphen
|
||||
separator is used. If a line number is also being output, it follows the file
|
||||
name without a space.
|
||||
filename is followed by a colon; for context lines, a hyphen separator is used.
|
||||
If a line number is also being output, it follows the file name.
|
||||
.TP
|
||||
\fB--help\fP
|
||||
Output a brief help message and exit.
|
||||
Output a help message, giving brief details of the command options and file
|
||||
type support, and then exit.
|
||||
.TP
|
||||
\fB-i\fP, \fB--ignore-case\fP
|
||||
Ignore upper/lower case distinctions during comparisons.
|
||||
.TP
|
||||
\fB--include\fP=\fIpattern\fP
|
||||
When \fBpcregrep\fP is searching the files in a directory as a consequence of
|
||||
the \fB-r\fP (recursive search) option, only those files whose names match the
|
||||
pattern are included. The pattern is a PCRE regular expression. If a file name
|
||||
matches both \fB--include\fP and \fB--exclude\fP, it is excluded. There is no
|
||||
short form for this option.
|
||||
the \fB-r\fP (recursive search) option, only those regular files whose names
|
||||
match the pattern are included. Subdirectories are always included and searched
|
||||
recursively, subject to the \fB--include_dir\fP and \fB--exclude_dir\fP
|
||||
options. The pattern is a PCRE regular expression, and is matched against the
|
||||
final component of the file name (not the entire path). If a file name matches
|
||||
both \fB--include\fP and \fB--exclude\fP, it is excluded. There is no short
|
||||
form for this option.
|
||||
.TP
|
||||
\fB--include_dir\fP=\fIpattern\fP
|
||||
When \fBpcregrep\fP is searching the contents of a directory as a consequence
|
||||
of the \fB-r\fP (recursive search) option, only those subdirectories whose
|
||||
names match the pattern are included. (Note that the \fB--include\fP option
|
||||
does not affect subdirectories.) The pattern is a PCRE regular expression, and
|
||||
is matched against the final component of the name (not the entire path). If a
|
||||
subdirectory name matches both \fB--include_dir\fP and \fB--exclude_dir\fP, it
|
||||
is excluded. There is no short form for this option.
|
||||
.TP
|
||||
\fB-L\fP, \fB--files-without-match\fP
|
||||
Instead of outputting lines from the files, just output the names of the files
|
||||
@@ -201,6 +271,15 @@ This option supplies a name to be used for the standard input when file names
|
||||
are being output. If not supplied, "(standard input)" is used. There is no
|
||||
short form for this option.
|
||||
.TP
|
||||
\fB--line-offsets\fP
|
||||
Instead of showing lines or parts of lines that match, show each match as a
|
||||
line number, the offset from the start of the line, and a length. The line
|
||||
number is terminated by a colon (as usual; see the \fB-n\fP option), and the
|
||||
offset and length are separated by a comma. In this mode, no context is shown.
|
||||
That is, the \fB-A\fP, \fB-B\fP, and \fB-C\fP options are ignored. If there is
|
||||
more than one match in a line, each of them is shown separately. This option is
|
||||
mutually exclusive with \fB--file-offsets\fP and \fB--only-matching\fP.
|
||||
.TP
|
||||
\fB--locale\fP=\fIlocale-name\fP
|
||||
This option specifies a locale to be used for pattern matching. It overrides
|
||||
the value in the \fBLC_ALL\fP or \fBLC_CTYPE\fP environment variables. If no
|
||||
@@ -220,26 +299,38 @@ the previous 8K characters (or all the previous characters, if fewer than 8K)
|
||||
are guaranteed to be available for lookbehind assertions.
|
||||
.TP
|
||||
\fB-N\fP \fInewline-type\fP, \fB--newline=\fP\fInewline-type\fP
|
||||
The PCRE library supports three different character sequences for indicating
|
||||
The PCRE library supports five different conventions for indicating
|
||||
the ends of lines. They are the single-character sequences CR (carriage return)
|
||||
and LF (linefeed), and the two-character sequence CR, LF. When the library is
|
||||
built, a default line-ending sequence is specified. This is normally the
|
||||
standard sequence for the operating system. Unless otherwise specified by this
|
||||
option, \fBpcregrep\fP uses the default. The possible values for this option
|
||||
are CR, LF, or CRLF. This makes it possible to use \fBpcregrep\fP on files that
|
||||
have come from other environments without having to modify their line endings.
|
||||
If the data that is being scanned does not agree with the convention set by
|
||||
this option, \fBpcregrep\fP may behave in strange ways.
|
||||
and LF (linefeed), the two-character sequence CRLF, an "anycrlf" convention,
|
||||
which recognizes any of the preceding three types, and an "any" convention, in
|
||||
which any Unicode line ending sequence is assumed to end a line. The Unicode
|
||||
sequences are the three just mentioned, plus VT (vertical tab, U+000B), FF
|
||||
(formfeed, U+000C), NEL (next line, U+0085), LS (line separator, U+2028), and
|
||||
PS (paragraph separator, U+2029).
|
||||
.sp
|
||||
When the PCRE library is built, a default line-ending sequence is specified.
|
||||
This is normally the standard sequence for the operating system. Unless
|
||||
otherwise specified by this option, \fBpcregrep\fP uses the library's default.
|
||||
The possible values for this option are CR, LF, CRLF, ANYCRLF, or ANY. This
|
||||
makes it possible to use \fBpcregrep\fP on files that have come from other
|
||||
environments without having to modify their line endings. If the data that is
|
||||
being scanned does not agree with the convention set by this option,
|
||||
\fBpcregrep\fP may behave in strange ways.
|
||||
.TP
|
||||
\fB-n\fP, \fB--line-number\fP
|
||||
Precede each output line by its line number in the file, followed by a colon
|
||||
and a space for matching lines or a hyphen and a space for context lines. If
|
||||
the filename is also being output, it precedes the line number.
|
||||
for matching lines or a hyphen for context lines. If the filename is also being
|
||||
output, it precedes the line number. This option is forced if
|
||||
\fB--line-offsets\fP is used.
|
||||
.TP
|
||||
\fB-o\fP, \fB--only-matching\fP
|
||||
Show only the part of the line that matched a pattern. In this mode, no
|
||||
context is shown. That is, the \fB-A\fP, \fB-B\fP, and \fB-C\fP options are
|
||||
ignored.
|
||||
ignored. If there is more than one match in a line, each of them is shown
|
||||
separately. If \fB-o\fP is combined with \fB-v\fP (invert the sense of the
|
||||
match to find non-matching lines), no output is generated, but the return code
|
||||
is set appropriately. This option is mutually exclusive with
|
||||
\fB--file-offsets\fP and \fB--line-offsets\fP.
|
||||
.TP
|
||||
\fB-q\fP, \fB--quiet\fP
|
||||
Work quietly, that is, display nothing except error messages. The exit
|
||||
@@ -274,7 +365,7 @@ the patterns are the ones that are found.
|
||||
Force the patterns to match only whole words. This is equivalent to having \eb
|
||||
at the start and end of the pattern.
|
||||
.TP
|
||||
\fB-x\fP, \fB--line-regex\fP, \fP--line-regexp\fP
|
||||
\fB-x\fP, \fB--line-regex\fP, \fB--line-regexp\fP
|
||||
Force the patterns to be anchored (each must start matching at the beginning of
|
||||
a line) and in addition, require them to match entire lines. This is
|
||||
equivalent to having ^ and $ characters at the start and end of each
|
||||
@@ -339,7 +430,7 @@ in the first form, using an equals character. Otherwise it will be assumed that
|
||||
it has no data.
|
||||
.
|
||||
.
|
||||
.SH MATCHING ERRORS
|
||||
.SH "MATCHING ERRORS"
|
||||
.rs
|
||||
.sp
|
||||
It is possible to supply a regular expression that takes a very long time to
|
||||
@@ -361,16 +452,26 @@ suppress error messages about inaccessble files does not affect the return
|
||||
code.
|
||||
.
|
||||
.
|
||||
.SH "SEE ALSO"
|
||||
.rs
|
||||
.sp
|
||||
\fBpcrepattern\fP(3), \fBpcretest\fP(1).
|
||||
.
|
||||
.
|
||||
.SH AUTHOR
|
||||
.rs
|
||||
.sp
|
||||
.nf
|
||||
Philip Hazel
|
||||
.br
|
||||
University Computing Service
|
||||
.br
|
||||
Cambridge CB2 3QG, England.
|
||||
.P
|
||||
.in 0
|
||||
Last updated: 06 June 2006
|
||||
.br
|
||||
Copyright (c) 1997-2006 University of Cambridge.
|
||||
Cambridge CB2 3QH, England.
|
||||
.fi
|
||||
.
|
||||
.
|
||||
.SH REVISION
|
||||
.rs
|
||||
.sp
|
||||
.nf
|
||||
Last updated: 01 March 2009
|
||||
Copyright (c) 1997-2009 University of Cambridge.
|
||||
.fi
|
||||
|
||||
+251
-138
@@ -14,8 +14,8 @@ DESCRIPTION
|
||||
pcregrep searches files for character patterns, in the same way as
|
||||
other grep commands do, but it uses the PCRE regular expression library
|
||||
to support patterns that are compatible with the regular expressions of
|
||||
Perl 5. See pcrepattern for a full description of syntax and semantics
|
||||
of the regular expressions that PCRE supports.
|
||||
Perl 5. See pcrepattern(3) for a full description of syntax and seman-
|
||||
tics of the regular expressions that PCRE supports.
|
||||
|
||||
Patterns, whether supplied on the command line or in a separate file,
|
||||
are given without delimiters. For example:
|
||||
@@ -24,37 +24,72 @@ DESCRIPTION
|
||||
|
||||
If you attempt to use delimiters (for example, by surrounding a pattern
|
||||
with slashes, as is common in Perl scripts), they are interpreted as
|
||||
part of the pattern. Quotes can of course be used on the command line
|
||||
because they are interpreted by the shell, and indeed they are required
|
||||
if a pattern contains white space or shell metacharacters.
|
||||
part of the pattern. Quotes can of course be used to delimit patterns
|
||||
on the command line because they are interpreted by the shell, and
|
||||
indeed they are required if a pattern contains white space or shell
|
||||
metacharacters.
|
||||
|
||||
The first argument that follows any option settings is treated as the
|
||||
single pattern to be matched when neither -e nor -f is present. Con-
|
||||
versely, when one or both of these options are used to specify pat-
|
||||
The first argument that follows any option settings is treated as the
|
||||
single pattern to be matched when neither -e nor -f is present. Con-
|
||||
versely, when one or both of these options are used to specify pat-
|
||||
terns, all arguments are treated as path names. At least one of -e, -f,
|
||||
or an argument pattern must be provided.
|
||||
|
||||
If no files are specified, pcregrep reads the standard input. The stan-
|
||||
dard input can also be referenced by a name consisting of a single
|
||||
dard input can also be referenced by a name consisting of a single
|
||||
hyphen. For example:
|
||||
|
||||
pcregrep some-pattern /file1 - /file3
|
||||
|
||||
By default, each line that matches the pattern is copied to the stan-
|
||||
dard output, and if there is more than one file, the file name is out-
|
||||
put at the start of each line. However, there are options that can
|
||||
change how pcregrep behaves. In particular, the -M option makes it pos-
|
||||
sible to search for patterns that span line boundaries. What defines a
|
||||
line boundary is controlled by the -N (--newline) option.
|
||||
By default, each line that matches a pattern is copied to the standard
|
||||
output, and if there is more than one file, the file name is output at
|
||||
the start of each line, followed by a colon. However, there are options
|
||||
that can change how pcregrep behaves. In particular, the -M option
|
||||
makes it possible to search for patterns that span line boundaries.
|
||||
What defines a line boundary is controlled by the -N (--newline)
|
||||
option.
|
||||
|
||||
Patterns are limited to 8K or BUFSIZ characters, whichever is the
|
||||
greater. BUFSIZ is defined in <stdio.h>.
|
||||
greater. BUFSIZ is defined in <stdio.h>. When there is more than one
|
||||
pattern (specified by the use of -e and/or -f), each pattern is applied
|
||||
to each line in the order in which they are defined, except that all
|
||||
the -e patterns are tried before the -f patterns.
|
||||
|
||||
If the LC_ALL or LC_CTYPE environment variable is set, pcregrep uses
|
||||
the value to set a locale when calling the PCRE library. The --locale
|
||||
By default, as soon as one pattern matches (or fails to match when -v
|
||||
is used), no further patterns are considered. However, if --colour (or
|
||||
--color) is used to colour the matching substrings, or if --only-match-
|
||||
ing, --file-offsets, or --line-offsets is used to output only the part
|
||||
of the line that matched (either shown literally, or as an offset),
|
||||
scanning resumes immediately following the match, so that further
|
||||
matches on the same line can be found. If there are multiple patterns,
|
||||
they are all tried on the remainder of the line, but patterns that fol-
|
||||
low the one that matched are not tried on the earlier part of the line.
|
||||
|
||||
This is the same behaviour as GNU grep, but it does mean that the order
|
||||
in which multiple patterns are specified can affect the output when one
|
||||
of the above options is used.
|
||||
|
||||
Patterns that can match an empty string are accepted, but empty string
|
||||
matches are not recognized. An example is the pattern "(super)?(man)?",
|
||||
in which all components are optional. This pattern finds all occur-
|
||||
rences of both "super" and "man"; the output differs from matching with
|
||||
"super|man" when only the matching substrings are being shown.
|
||||
|
||||
If the LC_ALL or LC_CTYPE environment variable is set, pcregrep uses
|
||||
the value to set a locale when calling the PCRE library. The --locale
|
||||
option can be used to override this.
|
||||
|
||||
|
||||
SUPPORT FOR COMPRESSED FILES
|
||||
|
||||
It is possible to compile pcregrep so that it uses libz or libbz2 to
|
||||
read files whose names end in .gz or .bz2, respectively. You can find
|
||||
out whether your binary has support for one or both of these file types
|
||||
by running it with the --help option. If the appropriate support is not
|
||||
present, files are treated as plain text. The standard input is always
|
||||
so treated.
|
||||
|
||||
|
||||
OPTIONS
|
||||
|
||||
-- This terminates the list of options. It is useful if the next
|
||||
@@ -99,110 +134,156 @@ OPTIONS
|
||||
the same shell item, separated by an equals sign.
|
||||
|
||||
--colour=value, --color=value
|
||||
This option specifies under what circumstances the part of a
|
||||
This option specifies under what circumstances the parts of a
|
||||
line that matched a pattern should be coloured in the output.
|
||||
The value may be "never" (the default), "always", or "auto".
|
||||
In the latter case, colouring happens only if the standard
|
||||
output is connected to a terminal. The colour can be speci-
|
||||
fied by setting the environment variable PCREGREP_COLOUR or
|
||||
PCREGREP_COLOR. The value of this variable should be a string
|
||||
of two numbers, separated by a semicolon. They are copied
|
||||
directly into the control string for setting colour on a ter-
|
||||
minal, so it is your responsibility to ensure that they make
|
||||
sense. If neither of the environment variables is set, the
|
||||
default is "1;31", which gives red.
|
||||
By default, the output is not coloured. The value (which is
|
||||
optional, see above) may be "never", "always", or "auto". In
|
||||
the latter case, colouring happens only if the standard out-
|
||||
put is connected to a terminal. More resources are used when
|
||||
colouring is enabled, because pcregrep has to search for all
|
||||
possible matches in a line, not just one, in order to colour
|
||||
them all.
|
||||
|
||||
The colour that is used can be specified by setting the envi-
|
||||
ronment variable PCREGREP_COLOUR or PCREGREP_COLOR. The value
|
||||
of this variable should be a string of two numbers, separated
|
||||
by a semicolon. They are copied directly into the control
|
||||
string for setting colour on a terminal, so it is your
|
||||
responsibility to ensure that they make sense. If neither of
|
||||
the environment variables is set, the default is "1;31",
|
||||
which gives red.
|
||||
|
||||
-D action, --devices=action
|
||||
If an input path is not a regular file or a directory,
|
||||
"action" specifies how it is to be processed. Valid values
|
||||
are "read" (the default) or "skip" (silently skip the path).
|
||||
are "read" (the default) or "skip" (silently skip the path).
|
||||
|
||||
-d action, --directories=action
|
||||
If an input path is a directory, "action" specifies how it is
|
||||
to be processed. Valid values are "read" (the default),
|
||||
"recurse" (equivalent to the -r option), or "skip" (silently
|
||||
skip the path). In the default case, directories are read as
|
||||
if they were ordinary files. In some operating systems the
|
||||
effect of reading a directory like this is an immediate end-
|
||||
to be processed. Valid values are "read" (the default),
|
||||
"recurse" (equivalent to the -r option), or "skip" (silently
|
||||
skip the path). In the default case, directories are read as
|
||||
if they were ordinary files. In some operating systems the
|
||||
effect of reading a directory like this is an immediate end-
|
||||
of-file.
|
||||
|
||||
-e pattern, --regex=pattern,
|
||||
--regexp=pattern Specify a pattern to be matched. This option
|
||||
can be used multiple times in order to specify several pat-
|
||||
terns. It can also be used as a way of specifying a single
|
||||
pattern that starts with a hyphen. When -e is used, no argu-
|
||||
ment pattern is taken from the command line; all arguments
|
||||
are treated as file names. There is an overall maximum of 100
|
||||
patterns. They are applied to each line in the order in which
|
||||
they are defined until one matches (or fails to match if -v
|
||||
is used). If -f is used with -e, the command line patterns
|
||||
are matched first, followed by the patterns from the file,
|
||||
independent of the order in which these options are speci-
|
||||
fied. Note that multiple use of -e is not the same as a sin-
|
||||
gle pattern with alternatives. For example, X|Y finds the
|
||||
first character in a line that is X or Y, whereas if the two
|
||||
patterns are given separately, pcregrep finds X if it is
|
||||
present, even if it follows Y in the line. It finds Y only if
|
||||
there is no X in the line. This really matters only if you
|
||||
are using -o to show the portion of the line that matched.
|
||||
-e pattern, --regex=pattern, --regexp=pattern
|
||||
Specify a pattern to be matched. This option can be used mul-
|
||||
tiple times in order to specify several patterns. It can also
|
||||
be used as a way of specifying a single pattern that starts
|
||||
with a hyphen. When -e is used, no argument pattern is taken
|
||||
from the command line; all arguments are treated as file
|
||||
names. There is an overall maximum of 100 patterns. They are
|
||||
applied to each line in the order in which they are defined
|
||||
until one matches (or fails to match if -v is used). If -f is
|
||||
used with -e, the command line patterns are matched first,
|
||||
followed by the patterns from the file, independent of the
|
||||
order in which these options are specified. Note that multi-
|
||||
ple use of -e is not the same as a single pattern with alter-
|
||||
natives. For example, X|Y finds the first character in a line
|
||||
that is X or Y, whereas if the two patterns are given sepa-
|
||||
rately, pcregrep finds X if it is present, even if it follows
|
||||
Y in the line. It finds Y only if there is no X in the line.
|
||||
This really matters only if you are using -o to show the
|
||||
part(s) of the line that matched.
|
||||
|
||||
--exclude=pattern
|
||||
When pcregrep is searching the files in a directory as a con-
|
||||
sequence of the -r (recursive search) option, any files whose
|
||||
names match the pattern are excluded. The pattern is a PCRE
|
||||
regular expression. If a file name matches both --include and
|
||||
--exclude, it is excluded. There is no short form for this
|
||||
sequence of the -r (recursive search) option, any regular
|
||||
files whose names match the pattern are excluded. Subdirecto-
|
||||
ries are not excluded by this option; they are searched
|
||||
recursively, subject to the --exclude_dir and --include_dir
|
||||
options. The pattern is a PCRE regular expression, and is
|
||||
matched against the final component of the file name (not the
|
||||
entire path). If a file name matches both --include and
|
||||
--exclude, it is excluded. There is no short form for this
|
||||
option.
|
||||
|
||||
--exclude_dir=pattern
|
||||
When pcregrep is searching the contents of a directory as a
|
||||
consequence of the -r (recursive search) option, any subdi-
|
||||
rectories whose names match the pattern are excluded. (Note
|
||||
that the --exclude option does not affect subdirectories.)
|
||||
The pattern is a PCRE regular expression, and is matched
|
||||
against the final component of the name (not the entire
|
||||
path). If a subdirectory name matches both --include_dir and
|
||||
--exclude_dir, it is excluded. There is no short form for
|
||||
this option.
|
||||
|
||||
-F, --fixed-strings
|
||||
Interpret each pattern as a list of fixed strings, separated
|
||||
by newlines, instead of as a regular expression. The -w
|
||||
(match as a word) and -x (match whole line) options can be
|
||||
Interpret each pattern as a list of fixed strings, separated
|
||||
by newlines, instead of as a regular expression. The -w
|
||||
(match as a word) and -x (match whole line) options can be
|
||||
used with -F. They apply to each of the fixed strings. A line
|
||||
is selected if any of the fixed strings are found in it (sub-
|
||||
ject to -w or -x, if present).
|
||||
|
||||
-f filename, --file=filename
|
||||
Read a number of patterns from the file, one per line, and
|
||||
match them against each line of input. A data line is output
|
||||
Read a number of patterns from the file, one per line, and
|
||||
match them against each line of input. A data line is output
|
||||
if any of the patterns match it. The filename can be given as
|
||||
"-" to refer to the standard input. When -f is used, patterns
|
||||
specified on the command line using -e may also be present;
|
||||
specified on the command line using -e may also be present;
|
||||
they are tested before the file's patterns. However, no other
|
||||
pattern is taken from the command line; all arguments are
|
||||
treated as file names. There is an overall maximum of 100
|
||||
pattern is taken from the command line; all arguments are
|
||||
treated as file names. There is an overall maximum of 100
|
||||
patterns. Trailing white space is removed from each line, and
|
||||
blank lines are ignored. An empty file contains no patterns
|
||||
and therefore matches nothing.
|
||||
blank lines are ignored. An empty file contains no patterns
|
||||
and therefore matches nothing. See also the comments about
|
||||
multiple patterns versus a single pattern with alternatives
|
||||
in the description of -e above.
|
||||
|
||||
--file-offsets
|
||||
Instead of showing lines or parts of lines that match, show
|
||||
each match as an offset from the start of the file and a
|
||||
length, separated by a comma. In this mode, no context is
|
||||
shown. That is, the -A, -B, and -C options are ignored. If
|
||||
there is more than one match in a line, each of them is shown
|
||||
separately. This option is mutually exclusive with --line-
|
||||
offsets and --only-matching.
|
||||
|
||||
-H, --with-filename
|
||||
Force the inclusion of the filename at the start of output
|
||||
lines when searching a single file. By default, the filename
|
||||
is not shown in this case. For matching lines, the filename
|
||||
is followed by a colon and a space; for context lines, a
|
||||
hyphen separator is used. If a line number is also being out-
|
||||
put, it follows the file name without a space.
|
||||
Force the inclusion of the filename at the start of output
|
||||
lines when searching a single file. By default, the filename
|
||||
is not shown in this case. For matching lines, the filename
|
||||
is followed by a colon; for context lines, a hyphen separator
|
||||
is used. If a line number is also being output, it follows
|
||||
the file name.
|
||||
|
||||
-h, --no-filename
|
||||
Suppress the output filenames when searching multiple files.
|
||||
By default, filenames are shown when multiple files are
|
||||
searched. For matching lines, the filename is followed by a
|
||||
colon and a space; for context lines, a hyphen separator is
|
||||
used. If a line number is also being output, it follows the
|
||||
file name without a space.
|
||||
Suppress the output of filenames when searching multiple files.
|
||||
By default, filenames are shown when multiple files are
|
||||
searched. For matching lines, the filename is followed by a
|
||||
colon; for context lines, a hyphen separator is used. If a
|
||||
line number is also being output, it follows the file name.
|
||||
|
||||
--help Output a brief help message and exit.
|
||||
--help Output a help message, giving brief details of the command
|
||||
options and file type support, and then exit.
|
||||
|
||||
-i, --ignore-case
|
||||
Ignore upper/lower case distinctions during comparisons.
|
||||
|
||||
--include=pattern
|
||||
When pcregrep is searching the files in a directory as a con-
|
||||
sequence of the -r (recursive search) option, only those
|
||||
files whose names match the pattern are included. The pattern
|
||||
is a PCRE regular expression. If a file name matches both
|
||||
--include and --exclude, it is excluded. There is no short
|
||||
form for this option.
|
||||
sequence of the -r (recursive search) option, only those reg-
|
||||
ular files whose names match the pattern are included. Subdi-
|
||||
rectories are always included and searched recursively, sub-
|
||||
ject to the --include_dir and --exclude_dir options. The pat-
|
||||
tern is a PCRE regular expression, and is matched against the
|
||||
final component of the file name (not the entire path). If a
|
||||
file name matches both --include and --exclude, it is
|
||||
excluded. There is no short form for this option.
|
||||
|
||||
--include_dir=pattern
|
||||
When pcregrep is searching the contents of a directory as a
|
||||
consequence of the -r (recursive search) option, only those
|
||||
subdirectories whose names match the pattern are included.
|
||||
(Note that the --include option does not affect subdirecto-
|
||||
ries.) The pattern is a PCRE regular expression, and is
|
||||
matched against the final component of the name (not the
|
||||
entire path). If a subdirectory name matches both
|
||||
--include_dir and --exclude_dir, it is excluded. There is no
|
||||
short form for this option.
|
||||
|
||||
-L, --files-without-match
|
||||
Instead of outputting lines from the files, just output the
|
||||
@@ -222,6 +303,17 @@ OPTIONS
|
||||
when file names are being output. If not supplied, "(standard
|
||||
input)" is used. There is no short form for this option.
|
||||
|
||||
--line-offsets
|
||||
Instead of showing lines or parts of lines that match, show
|
||||
each match as a line number, the offset from the start of the
|
||||
line, and a length. The line number is terminated by a colon
|
||||
(as usual; see the -n option), and the offset and length are
|
||||
separated by a comma. In this mode, no context is shown.
|
||||
That is, the -A, -B, and -C options are ignored. If there is
|
||||
more than one match in a line, each of them is shown sepa-
|
||||
rately. This option is mutually exclusive with --file-offsets
|
||||
and --only-matching.
|
||||
|
||||
--locale=locale-name
|
||||
This option specifies a locale to be used for pattern match-
|
||||
ing. It overrides the value in the LC_ALL or LC_CTYPE envi-
|
||||
@@ -245,60 +337,73 @@ OPTIONS
|
||||
lookbehind assertions.
|
||||
|
||||
-N newline-type, --newline=newline-type
|
||||
The PCRE library supports three different character sequences
|
||||
for indicating the ends of lines. They are the single-charac-
|
||||
ter sequences CR (carriage return) and LF (linefeed), and the
|
||||
two-character sequence CR, LF. When the library is built, a
|
||||
default line-ending sequence is specified. This is normally
|
||||
the standard sequence for the operating system. Unless other-
|
||||
wise specified by this option, pcregrep uses the default. The
|
||||
possible values for this option are CR, LF, or CRLF. This
|
||||
makes it possible to use pcregrep on files that have come
|
||||
from other environments without having to modify their line
|
||||
endings. If the data that is being scanned does not agree
|
||||
with the convention set by this option, pcregrep may behave
|
||||
in strange ways.
|
||||
The PCRE library supports five different conventions for
|
||||
indicating the ends of lines. They are the single-character
|
||||
sequences CR (carriage return) and LF (linefeed), the two-
|
||||
character sequence CRLF, an "anycrlf" convention, which rec-
|
||||
ognizes any of the preceding three types, and an "any" con-
|
||||
vention, in which any Unicode line ending sequence is assumed
|
||||
to end a line. The Unicode sequences are the three just men-
|
||||
tioned, plus VT (vertical tab, U+000B), FF (formfeed,
|
||||
U+000C), NEL (next line, U+0085), LS (line separator,
|
||||
U+2028), and PS (paragraph separator, U+2029).
|
||||
|
||||
When the PCRE library is built, a default line-ending
|
||||
sequence is specified. This is normally the standard
|
||||
sequence for the operating system. Unless otherwise specified
|
||||
by this option, pcregrep uses the library's default. The
|
||||
possible values for this option are CR, LF, CRLF, ANYCRLF, or
|
||||
ANY. This makes it possible to use pcregrep on files that
|
||||
have come from other environments without having to modify
|
||||
their line endings. If the data that is being scanned does
|
||||
not agree with the convention set by this option, pcregrep
|
||||
may behave in strange ways.
|
||||
|
||||
-n, --line-number
|
||||
Precede each output line by its line number in the file, fol-
|
||||
lowed by a colon and a space for matching lines or a hyphen
|
||||
and a space for context lines. If the filename is also being
|
||||
output, it precedes the line number.
|
||||
lowed by a colon for matching lines or a hyphen for context
|
||||
lines. If the filename is also being output, it precedes the
|
||||
line number. This option is forced if --line-offsets is used.
|
||||
|
||||
-o, --only-matching
|
||||
Show only the part of the line that matched a pattern. In
|
||||
this mode, no context is shown. That is, the -A, -B, and -C
|
||||
options are ignored.
|
||||
options are ignored. If there is more than one match in a
|
||||
line, each of them is shown separately. If -o is combined
|
||||
with -v (invert the sense of the match to find non-matching
|
||||
lines), no output is generated, but the return code is set
|
||||
appropriately. This option is mutually exclusive with --file-
|
||||
offsets and --line-offsets.
|
||||
|
||||
-q, --quiet
|
||||
Work quietly, that is, display nothing except error messages.
|
||||
The exit status indicates whether or not any matches were
|
||||
The exit status indicates whether or not any matches were
|
||||
found.
|
||||
|
||||
-r, --recursive
|
||||
If any given path is a directory, recursively scan the files
|
||||
it contains, taking note of any --include and --exclude set-
|
||||
tings. By default, a directory is read as a normal file; in
|
||||
some operating systems this gives an immediate end-of-file.
|
||||
This option is a shorthand for setting the -d option to
|
||||
If any given path is a directory, recursively scan the files
|
||||
it contains, taking note of any --include and --exclude set-
|
||||
tings. By default, a directory is read as a normal file; in
|
||||
some operating systems this gives an immediate end-of-file.
|
||||
This option is a shorthand for setting the -d option to
|
||||
"recurse".
|
||||
|
||||
-s, --no-messages
|
||||
Suppress error messages about non-existent or unreadable
|
||||
files. Such files are quietly skipped. However, the return
|
||||
Suppress error messages about non-existent or unreadable
|
||||
files. Such files are quietly skipped. However, the return
|
||||
code is still 2, even if matches were found in other files.
|
||||
|
||||
-u, --utf-8
|
||||
Operate in UTF-8 mode. This option is available only if PCRE
|
||||
has been compiled with UTF-8 support. Both patterns and sub-
|
||||
Operate in UTF-8 mode. This option is available only if PCRE
|
||||
has been compiled with UTF-8 support. Both patterns and sub-
|
||||
ject lines must be valid strings of UTF-8 characters.
|
||||
|
||||
-V, --version
|
||||
Write the version numbers of pcregrep and the PCRE library
|
||||
Write the version numbers of pcregrep and the PCRE library
|
||||
that is being used to the standard error stream.
|
||||
|
||||
-v, --invert-match
|
||||
Invert the sense of the match, so that lines which do not
|
||||
Invert the sense of the match, so that lines which do not
|
||||
match any of the patterns are the ones that are found.
|
||||
|
||||
-w, --word-regex, --word-regexp
|
||||
@@ -306,61 +411,61 @@ OPTIONS
|
||||
lent to having \b at the start and end of the pattern.
|
||||
|
||||
-x, --line-regex, --line-regexp
|
||||
Force the patterns to be anchored (each must start matching
|
||||
at the beginning of a line) and in addition, require them to
|
||||
match entire lines. This is equivalent to having ^ and $
|
||||
Force the patterns to be anchored (each must start matching
|
||||
at the beginning of a line) and in addition, require them to
|
||||
match entire lines. This is equivalent to having ^ and $
|
||||
characters at the start and end of each alternative branch in
|
||||
every pattern.
|
||||
|
||||
|
||||
ENVIRONMENT VARIABLES
|
||||
|
||||
The environment variables LC_ALL and LC_CTYPE are examined, in that
|
||||
order, for a locale. The first one that is set is used. This can be
|
||||
overridden by the --locale option. If no locale is set, the PCRE
|
||||
The environment variables LC_ALL and LC_CTYPE are examined, in that
|
||||
order, for a locale. The first one that is set is used. This can be
|
||||
overridden by the --locale option. If no locale is set, the PCRE
|
||||
library's default (usually the "C" locale) is used.
|
||||
|
||||
|
||||
NEWLINES
|
||||
|
||||
The -N (--newline) option allows pcregrep to scan files with different
|
||||
newline conventions from the default. However, the setting of this
|
||||
option does not affect the way in which pcregrep writes information to
|
||||
the standard error and output streams. It uses the string "\n" in C
|
||||
printf() calls to indicate newlines, relying on the C I/O library to
|
||||
convert this to an appropriate sequence if the output is sent to a
|
||||
The -N (--newline) option allows pcregrep to scan files with different
|
||||
newline conventions from the default. However, the setting of this
|
||||
option does not affect the way in which pcregrep writes information to
|
||||
the standard error and output streams. It uses the string "\n" in C
|
||||
printf() calls to indicate newlines, relying on the C I/O library to
|
||||
convert this to an appropriate sequence if the output is sent to a
|
||||
file.
|
||||
|
||||
|
||||
OPTIONS COMPATIBILITY
|
||||
|
||||
The majority of short and long forms of pcregrep's options are the same
|
||||
as in the GNU grep program. Any long option of the form --xxx-regexp
|
||||
(GNU terminology) is also available as --xxx-regex (PCRE terminology).
|
||||
However, the --locale, -M, --multiline, -u, and --utf-8 options are
|
||||
as in the GNU grep program. Any long option of the form --xxx-regexp
|
||||
(GNU terminology) is also available as --xxx-regex (PCRE terminology).
|
||||
However, the --locale, -M, --multiline, -u, and --utf-8 options are
|
||||
specific to pcregrep.
|
||||
|
||||
|
||||
OPTIONS WITH DATA
|
||||
|
||||
There are four different ways in which an option with data can be spec-
|
||||
ified. If a short form option is used, the data may follow immedi-
|
||||
ified. If a short form option is used, the data may follow immedi-
|
||||
ately, or in the next command line item. For example:
|
||||
|
||||
-f/some/file
|
||||
-f /some/file
|
||||
|
||||
If a long form option is used, the data may appear in the same command
|
||||
If a long form option is used, the data may appear in the same command
|
||||
line item, separated by an equals character, or (with one exception) it
|
||||
may appear in the next command line item. For example:
|
||||
|
||||
--file=/some/file
|
||||
--file /some/file
|
||||
|
||||
Note, however, that if you want to supply a file name beginning with ~
|
||||
as data in a shell command, and have the shell expand ~ to a home
|
||||
Note, however, that if you want to supply a file name beginning with ~
|
||||
as data in a shell command, and have the shell expand ~ to a home
|
||||
directory, you must separate the file name from the option, because the
|
||||
shell does not treat ~ specially unless it is at the start of an item.
|
||||
shell does not treat ~ specially unless it is at the start of an item.
|
||||
|
||||
The exception to the above is the --colour (or --color) option, for
|
||||
which the data is optional. If this option does have data, it must be
|
||||
@@ -389,11 +494,19 @@ DIAGNOSTICS
|
||||
not affect the return code.
|
||||
|
||||
|
||||
SEE ALSO
|
||||
|
||||
pcrepattern(3), pcretest(1).
|
||||
|
||||
|
||||
AUTHOR
|
||||
|
||||
Philip Hazel
|
||||
University Computing Service
|
||||
Cambridge CB2 3QG, England.
|
||||
Cambridge CB2 3QH, England.
|
||||
|
||||
Last updated: 06 June 2006
|
||||
Copyright (c) 1997-2006 University of Cambridge.
|
||||
|
||||
REVISION
|
||||
|
||||
Last updated: 01 March 2009
|
||||
Copyright (c) 1997-2009 University of Cambridge.
|
||||
|
||||
@@ -26,7 +26,7 @@ is matched against the string
|
||||
<something> <something else> <something further>
|
||||
.sp
|
||||
there are three possible answers. The standard algorithm finds only one of
|
||||
them, whereas the DFA algorithm finds all three.
|
||||
them, whereas the alternative algorithm finds all three.
|
||||
.
|
||||
.SH "REGULAR EXPRESSIONS AS TREES"
|
||||
.rs
|
||||
@@ -41,8 +41,8 @@ correspond to the two matching algorithms provided by PCRE.
|
||||
.SH "THE STANDARD MATCHING ALGORITHM"
|
||||
.rs
|
||||
.sp
|
||||
In the terminology of Jeffrey Friedl's book \fIMastering Regular
|
||||
Expressions\fP, the standard algorithm is an "NFA algorithm". It conducts a
|
||||
In the terminology of Jeffrey Friedl's book "Mastering Regular
|
||||
Expressions", the standard algorithm is an "NFA algorithm". It conducts a
|
||||
depth-first search of the pattern tree. That is, it proceeds along a single
|
||||
path through the tree, checking that the subject matches what is required. When
|
||||
there is a mismatch, the algorithm tries any alternatives at the current point,
|
||||
@@ -63,15 +63,16 @@ straightforward for this algorithm to keep track of the substrings that are
|
||||
matched by portions of the pattern in parentheses. This provides support for
|
||||
capturing parentheses and back references.
|
||||
.
|
||||
.SH "THE DFA MATCHING ALGORITHM"
|
||||
.SH "THE ALTERNATIVE MATCHING ALGORITHM"
|
||||
.rs
|
||||
.sp
|
||||
DFA stands for "deterministic finite automaton", but you do not need to
|
||||
understand the origins of that name. This algorithm conducts a breadth-first
|
||||
search of the tree. Starting from the first matching point in the subject, it
|
||||
scans the subject string from left to right, once, character by character, and
|
||||
as it does this, it remembers all the paths through the tree that represent
|
||||
valid matches.
|
||||
This algorithm conducts a breadth-first search of the tree. Starting from the
|
||||
first matching point in the subject, it scans the subject string from left to
|
||||
right, once, character by character, and as it does this, it remembers all the
|
||||
paths through the tree that represent valid matches. In Friedl's terminology,
|
||||
this is a kind of "DFA algorithm", though it is not implemented as a
|
||||
traditional finite state machine (it keeps multiple states active
|
||||
simultaneously).
|
||||
.P
|
||||
The scan continues until either the end of the subject is reached, or there are
|
||||
no more unterminated paths. At this point, terminated paths represent the
|
||||
@@ -92,11 +93,20 @@ character of the subject. The algorithm does not automatically move on to find
|
||||
matches that start at later positions.
|
||||
.P
|
||||
There are a number of features of PCRE regular expressions that are not
|
||||
supported by the DFA matching algorithm. They are as follows:
|
||||
supported by the alternative matching algorithm. They are as follows:
|
||||
.P
|
||||
1. Because the algorithm finds all possible matches, the greedy or ungreedy
|
||||
nature of repetition quantifiers is not relevant. Greedy and ungreedy
|
||||
quantifiers are treated in exactly the same way.
|
||||
quantifiers are treated in exactly the same way. However, possessive
|
||||
quantifiers can make a difference when what follows could also match what is
|
||||
quantified, for example in a pattern like this:
|
||||
.sp
|
||||
^a++\ew!
|
||||
.sp
|
||||
This pattern matches "aaab!" but not "aaa!", which would be matched by a
|
||||
non-possessive quantifier. Similarly, if an atomic group is present, it is
|
||||
matched as if it were a standalone pattern at the current point, and the
|
||||
longest match is then "locked in" for the rest of the overall pattern.
|
||||
.P
|
||||
2. When dealing with multiple paths through the tree simultaneously, it is not
|
||||
straightforward to keep track of captured substrings for the different matching
|
||||
@@ -107,21 +117,27 @@ do this. This means that no captured substrings are available.
|
||||
not supported, and cause errors if encountered.
|
||||
.P
|
||||
4. For the same reason, conditional expressions that use a backreference as the
|
||||
condition are not supported.
|
||||
condition or test for a specific group recursion are not supported.
|
||||
.P
|
||||
5. Callouts are supported, but the value of the \fIcapture_top\fP field is
|
||||
5. Because many paths through the tree may be active, the \eK escape sequence,
|
||||
which resets the start of the match when encountered (but may be on some paths
|
||||
and not on others), is not supported. It causes an error if encountered.
|
||||
.P
|
||||
6. Callouts are supported, but the value of the \fIcapture_top\fP field is
|
||||
always 1, and the value of the \fIcapture_last\fP field is always -1.
|
||||
.P
|
||||
6.
|
||||
The \eC escape sequence, which (in the standard algorithm) matches a single
|
||||
byte, even in UTF-8 mode, is not supported because the DFA algorithm moves
|
||||
through the subject string one character at a time, for all active paths
|
||||
7. The \eC escape sequence, which (in the standard algorithm) matches a single
|
||||
byte, even in UTF-8 mode, is not supported because the alternative algorithm
|
||||
moves through the subject string one character at a time, for all active paths
|
||||
through the tree.
|
||||
.P
|
||||
8. Except for (*FAIL), the backtracking control verbs such as (*PRUNE) are not
|
||||
supported. (*FAIL) is supported, and behaves like a failing negative assertion.
|
||||
.
|
||||
.SH "ADVANTAGES OF THE DFA ALGORITHM"
|
||||
.SH "ADVANTAGES OF THE ALTERNATIVE ALGORITHM"
|
||||
.rs
|
||||
.sp
|
||||
Using the DFA matching algorithm provides the following advantages:
|
||||
Using the alternative matching algorithm provides the following advantages:
|
||||
.P
|
||||
1. All possible matches (at a single point in the subject) are automatically
|
||||
found, and in particular, the longest match is found. To find more than one
|
||||
@@ -130,17 +146,18 @@ callouts.
|
||||
.P
|
||||
2. There is much better support for partial matching. The restrictions on the
|
||||
content of the pattern that apply when using the standard algorithm for partial
|
||||
matching do not apply to the DFA algorithm. For non-anchored patterns, the
|
||||
starting position of a partial match is available.
|
||||
matching do not apply to the alternative algorithm. For non-anchored patterns,
|
||||
the starting position of a partial match is available.
|
||||
.P
|
||||
3. Because the DFA algorithm scans the subject string just once, and never
|
||||
needs to backtrack, it is possible to pass very long subject strings to the
|
||||
matching function in several pieces, checking for partial matching each time.
|
||||
3. Because the alternative algorithm scans the subject string just once, and
|
||||
never needs to backtrack, it is possible to pass very long subject strings to
|
||||
the matching function in several pieces, checking for partial matching each
|
||||
time.
|
||||
.
|
||||
.SH "DISADVANTAGES OF THE DFA ALGORITHM"
|
||||
.SH "DISADVANTAGES OF THE ALTERNATIVE ALGORITHM"
|
||||
.rs
|
||||
.sp
|
||||
The DFA algorithm suffers from a number of disadvantages:
|
||||
The alternative algorithm suffers from a number of disadvantages:
|
||||
.P
|
||||
1. It is substantially slower than the standard algorithm. This is partly
|
||||
because it has to search for all possible matches, but is also because it is
|
||||
@@ -148,10 +165,24 @@ less susceptible to optimization.
|
||||
.P
|
||||
2. Capturing parentheses and back references are not supported.
|
||||
.P
|
||||
3. The "atomic group" feature of PCRE regular expressions is supported, but
|
||||
does not provide the advantage that it does for the standard algorithm.
|
||||
.P
|
||||
.in 0
|
||||
Last updated: 06 June 2006
|
||||
.br
|
||||
Copyright (c) 1997-2006 University of Cambridge.
|
||||
3. Although atomic groups are supported, their use does not provide the
|
||||
performance advantage that it does for the standard algorithm.
|
||||
.
|
||||
.
|
||||
.SH AUTHOR
|
||||
.rs
|
||||
.sp
|
||||
.nf
|
||||
Philip Hazel
|
||||
University Computing Service
|
||||
Cambridge CB2 3QH, England.
|
||||
.fi
|
||||
.
|
||||
.
|
||||
.SH REVISION
|
||||
.rs
|
||||
.sp
|
||||
.nf
|
||||
Last updated: 19 April 2008
|
||||
Copyright (c) 1997-2008 University of Cambridge.
|
||||
.fi
|
||||
|
||||
+35
-19
@@ -71,6 +71,8 @@ envisaged for this facility, this is not felt to be a major restriction.
|
||||
.P
|
||||
If PCRE_PARTIAL is set for a pattern that does not conform to the restrictions,
|
||||
\fBpcre_exec()\fP returns the error code PCRE_ERROR_BADPARTIAL (-13).
|
||||
You can use the PCRE_INFO_OKPARTIAL call to \fBpcre_fullinfo()\fP to find out
|
||||
if a compiled pattern can be used for partial matching.
|
||||
.
|
||||
.
|
||||
.SH "EXAMPLE OF PARTIAL MATCHING USING PCRETEST"
|
||||
@@ -95,10 +97,11 @@ uses the date example quoted above:
|
||||
.sp
|
||||
The first data string is matched completely, so \fBpcretest\fP shows the
|
||||
matched substrings. The remaining four strings do not match the complete
|
||||
pattern, but the first two are partial matches. The same test, using DFA
|
||||
matching (by means of the \eD escape sequence), produces the following output:
|
||||
pattern, but the first two are partial matches. The same test, using
|
||||
\fBpcre_dfa_exec()\fP matching (by means of the \eD escape sequence), produces
|
||||
the following output:
|
||||
.sp
|
||||
re> /^\d?\d(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\d\d$/
|
||||
re> /^\ed?\ed(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\ed\ed$/
|
||||
data> 25jun04\eP\eD
|
||||
0: 25jun04
|
||||
data> 23dec3\eP\eD
|
||||
@@ -119,13 +122,13 @@ available.
|
||||
.sp
|
||||
When a partial match has been found using \fBpcre_dfa_exec()\fP, it is possible
|
||||
to continue the match by providing additional subject data and calling
|
||||
\fBpcre_dfa_exec()\fP again with the PCRE_DFA_RESTART option and the same
|
||||
working space (where details of the previous partial match are stored). Here is
|
||||
an example using \fBpcretest\fP, where the \eR escape sequence sets the
|
||||
PCRE_DFA_RESTART option and the \eD escape sequence requests the use of
|
||||
\fBpcre_dfa_exec()\fP:
|
||||
\fBpcre_dfa_exec()\fP again with the same compiled regular expression, this
|
||||
time setting the PCRE_DFA_RESTART option. You must also pass the same working
|
||||
space as before, because this is where details of the previous partial match
|
||||
are stored. Here is an example using \fBpcretest\fP, using the \eR escape
|
||||
sequence to set the PCRE_DFA_RESTART option (\eP and \eD are as above):
|
||||
.sp
|
||||
re> /^\d?\d(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\d\d$/
|
||||
re> /^\ed?\ed(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\ed\ed$/
|
||||
data> 23ja\eP\eD
|
||||
Partial match: 23ja
|
||||
data> n05\eR\eD
|
||||
@@ -137,9 +140,10 @@ Notice that when the match is complete, only the last part is shown; PCRE does
|
||||
not retain the previously partially-matched string. It is up to the calling
|
||||
program to do that if it needs to.
|
||||
.P
|
||||
This facility can be used to pass very long subject strings to
|
||||
\fBpcre_dfa_exec()\fP. However, some care is needed for certain types of
|
||||
pattern.
|
||||
You can set PCRE_PARTIAL with PCRE_DFA_RESTART to continue partial matching
|
||||
over multiple segments. This facility can be used to pass very long subject
|
||||
strings to \fBpcre_dfa_exec()\fP. However, some care is needed for certain
|
||||
types of pattern.
|
||||
.P
|
||||
1. If the pattern contains tests for the beginning or end of a line, you need
|
||||
to pass the PCRE_NOTBOL or PCRE_NOTEOL options, as appropriate, when the
|
||||
@@ -147,7 +151,7 @@ subject string for any call does not contain the beginning or end of a line.
|
||||
.P
|
||||
2. If the pattern contains backward assertions (including \eb or \eB), you need
|
||||
to arrange for some overlap in the subject strings to allow for this. For
|
||||
example, you could pass the subject in chunks that were 500 bytes long, but in
|
||||
example, you could pass the subject in chunks that are 500 bytes long, but in
|
||||
a buffer of 700 bytes, with the starting offset set to 200 and the previous 200
|
||||
bytes at the start of the buffer.
|
||||
.P
|
||||
@@ -155,7 +159,7 @@ bytes at the start of the buffer.
|
||||
always produce exactly the same result as matching over one single long string.
|
||||
The difference arises when there are multiple matching possibilities, because a
|
||||
partial match result is given only when there are no completed matches in a
|
||||
call to fBpcre_dfa_exec()\fP. This means that as soon as the shortest match has
|
||||
call to \fBpcre_dfa_exec()\fP. This means that as soon as the shortest match has
|
||||
been found, continuation to a new subject segment is no longer possible.
|
||||
Consider this \fBpcretest\fP example:
|
||||
.sp
|
||||
@@ -196,8 +200,20 @@ patterns or patterns such as:
|
||||
where no string can be a partial match for both alternatives.
|
||||
.
|
||||
.
|
||||
.P
|
||||
.in 0
|
||||
Last updated: 16 January 2006
|
||||
.br
|
||||
Copyright (c) 1997-2006 University of Cambridge.
|
||||
.SH AUTHOR
|
||||
.rs
|
||||
.sp
|
||||
.nf
|
||||
Philip Hazel
|
||||
University Computing Service
|
||||
Cambridge CB2 3QH, England.
|
||||
.fi
|
||||
.
|
||||
.
|
||||
.SH REVISION
|
||||
.rs
|
||||
.sp
|
||||
.nf
|
||||
Last updated: 04 June 2007
|
||||
Copyright (c) 1997-2007 University of Cambridge.
|
||||
.fi
|
||||
|
||||
+787
-173
File diff suppressed because it is too large
Load Diff
+92
-15
@@ -4,13 +4,75 @@ PCRE - Perl-compatible regular expressions
|
||||
.SH "PCRE PERFORMANCE"
|
||||
.rs
|
||||
.sp
|
||||
Certain items that may appear in regular expression patterns are more efficient
|
||||
Two aspects of performance are discussed below: memory usage and processing
|
||||
time. The way you express your pattern as a regular expression can affect both
|
||||
of them.
|
||||
.
|
||||
.SH "MEMORY USAGE"
|
||||
.rs
|
||||
.sp
|
||||
Patterns are compiled by PCRE into a reasonably efficient byte code, so that
|
||||
most simple patterns do not use much memory. However, there is one case where
|
||||
memory usage can be unexpectedly large. When a parenthesized subpattern has a
|
||||
quantifier with a minimum greater than 1 and/or a limited maximum, the whole
|
||||
subpattern is repeated in the compiled code. For example, the pattern
|
||||
.sp
|
||||
(abc|def){2,4}
|
||||
.sp
|
||||
is compiled as if it were
|
||||
.sp
|
||||
(abc|def)(abc|def)((abc|def)(abc|def)?)?
|
||||
.sp
|
||||
(Technical aside: It is done this way so that backtrack points within each of
|
||||
the repetitions can be independently maintained.)
|
||||
.P
|
||||
For regular expressions whose quantifiers use only small numbers, this is not
|
||||
usually a problem. However, if the numbers are large, and particularly if such
|
||||
repetitions are nested, the memory usage can become an embarrassment. For
|
||||
example, the very simple pattern
|
||||
.sp
|
||||
((ab){1,1000}c){1,3}
|
||||
.sp
|
||||
uses 51K bytes when compiled. When PCRE is compiled with its default internal
|
||||
pointer size of two bytes, the size limit on a compiled pattern is 64K, and
|
||||
this is reached with the above pattern if the outer repetition is increased
|
||||
from 3 to 4. PCRE can be compiled to use larger internal pointers and thus
|
||||
handle larger compiled patterns, but it is better to try to rewrite your
|
||||
pattern to use less memory if you can.
|
||||
.P
|
||||
One way of reducing the memory usage for such patterns is to make use of PCRE's
|
||||
.\" HTML <a href="pcrepattern.html#subpatternsassubroutines">
|
||||
.\" </a>
|
||||
"subroutine"
|
||||
.\"
|
||||
facility. Re-writing the above pattern as
|
||||
.sp
|
||||
((ab)(?2){0,999}c)(?1){0,2}
|
||||
.sp
|
||||
reduces the memory requirements to 18K, and indeed it remains under 20K even
|
||||
with the outer repetition increased to 100. However, this pattern is not
|
||||
exactly equivalent, because the "subroutine" calls are treated as
|
||||
.\" HTML <a href="pcrepattern.html#atomicgroup">
|
||||
.\" </a>
|
||||
atomic groups
|
||||
.\"
|
||||
into which there can be no backtracking if there is a subsequent matching
|
||||
failure. Therefore, PCRE cannot do this kind of rewriting automatically.
|
||||
Furthermore, there is a noticeable loss of speed when executing the modified
|
||||
pattern. Nevertheless, if the atomic grouping is not a problem and the loss of
|
||||
speed is acceptable, this kind of rewriting will allow you to process patterns
|
||||
that PCRE cannot otherwise handle.
|
||||
.
|
||||
.SH "PROCESSING TIME"
|
||||
.rs
|
||||
.sp
|
||||
Certain items in regular expression patterns are processed more efficiently
|
||||
than others. It is more efficient to use a character class like [aeiou] than a
|
||||
set of alternatives such as (a|e|i|o|u). In general, the simplest construction
|
||||
that provides the required behaviour is usually the most efficient. Jeffrey
|
||||
Friedl's book contains a lot of useful general discussion about optimizing
|
||||
regular expressions for efficient performance. This document contains a few
|
||||
observations about PCRE.
|
||||
set of single-character alternatives such as (a|e|i|o|u). In general, the
|
||||
simplest construction that provides the required behaviour is usually the most
|
||||
efficient. Jeffrey Friedl's book contains a lot of useful general discussion
|
||||
about optimizing regular expressions for efficient performance. This document
|
||||
contains a few observations about PCRE.
|
||||
.P
|
||||
Using Unicode character properties (the \ep, \eP, and \eX escapes) is slow,
|
||||
because PCRE has to scan a structure that contains data for over fifteen
|
||||
@@ -42,14 +104,15 @@ Beware of patterns that contain nested indefinite repeats. These can take a
|
||||
long time to run when applied to a string that does not match. Consider the
|
||||
pattern fragment
|
||||
.sp
|
||||
(a+)*
|
||||
^(a+)*
|
||||
.sp
|
||||
This can match "aaaa" in 33 different ways, and this number increases very
|
||||
This can match "aaaa" in 16 different ways, and this number increases very
|
||||
rapidly as the string gets longer. (The * repeat can match 0, 1, 2, 3, or 4
|
||||
times, and for each of those cases other than 0, the + repeats can match
|
||||
times, and for each of those cases other than 0 or 4, the + repeats can match
|
||||
different numbers of times.) When the remainder of the pattern is such that the
|
||||
entire match is going to fail, PCRE has in principle to try every possible
|
||||
variation, and this can take an extremely long time.
|
||||
variation, and this can take an extremely long time, even for relatively short
|
||||
strings.
|
||||
.P
|
||||
An optimization catches some of the simpler cases such as
|
||||
.sp
|
||||
@@ -69,8 +132,22 @@ appreciable time with strings longer than about 20 characters.
|
||||
.P
|
||||
In many cases, the solution to this kind of performance issue is to use an
|
||||
atomic group or a possessive quantifier.
|
||||
.P
|
||||
.in 0
|
||||
Last updated: 28 February 2005
|
||||
.br
|
||||
Copyright (c) 1997-2005 University of Cambridge.
|
||||
.
|
||||
.
|
||||
.SH AUTHOR
|
||||
.rs
|
||||
.sp
|
||||
.nf
|
||||
Philip Hazel
|
||||
University Computing Service
|
||||
Cambridge CB2 3QH, England.
|
||||
.fi
|
||||
.
|
||||
.
|
||||
.SH REVISION
|
||||
.rs
|
||||
.sp
|
||||
.nf
|
||||
Last updated: 06 March 2007
|
||||
Copyright (c) 1997-2007 University of Cambridge.
|
||||
.fi
|
||||
|
||||
+38
-20
@@ -7,22 +7,18 @@ PCRE - Perl-compatible regular expressions.
|
||||
.B #include <pcreposix.h>
|
||||
.PP
|
||||
.SM
|
||||
.br
|
||||
.B int regcomp(regex_t *\fIpreg\fP, const char *\fIpattern\fP,
|
||||
.ti +5n
|
||||
.B int \fIcflags\fP);
|
||||
.PP
|
||||
.br
|
||||
.B int regexec(regex_t *\fIpreg\fP, const char *\fIstring\fP,
|
||||
.ti +5n
|
||||
.B size_t \fInmatch\fP, regmatch_t \fIpmatch\fP[], int \fIeflags\fP);
|
||||
.PP
|
||||
.br
|
||||
.B size_t regerror(int \fIerrcode\fP, const regex_t *\fIpreg\fP,
|
||||
.ti +5n
|
||||
.B char *\fIerrbuf\fP, size_t \fIerrbuf_size\fP);
|
||||
.PP
|
||||
.br
|
||||
.B void regfree(regex_t *\fIpreg\fP);
|
||||
.
|
||||
.SH DESCRIPTION
|
||||
@@ -43,11 +39,11 @@ header file, and on Unix systems the library itself is called
|
||||
command for linking an application that uses them. Because the POSIX functions
|
||||
call the native ones, it is also necessary to add \fB-lpcre\fP.
|
||||
.P
|
||||
I have implemented only those option bits that can be reasonably mapped to PCRE
|
||||
native options. In addition, the option REG_EXTENDED is defined with the value
|
||||
zero. This has no effect, but since programs that are written to the POSIX
|
||||
interface often use it, this makes it easier to slot in PCRE as a replacement
|
||||
library. Other POSIX options are not even defined.
|
||||
I have implemented only those POSIX option bits that can be reasonably mapped
|
||||
to PCRE native options. In addition, the option REG_EXTENDED is defined with
|
||||
the value zero. This has no effect, but since programs that are written to the
|
||||
POSIX interface often use it, this makes it easier to slot in PCRE as a
|
||||
replacement library. Other POSIX options are not even defined.
|
||||
.P
|
||||
When PCRE is called via these functions, it is only the API that is POSIX-like
|
||||
in style. The syntax and semantics of the regular expressions themselves are
|
||||
@@ -161,18 +157,36 @@ REG_NEWLINE action.
|
||||
.rs
|
||||
.sp
|
||||
The function \fBregexec()\fP is called to match a compiled pattern \fIpreg\fP
|
||||
against a given \fIstring\fP, which is terminated by a zero byte, subject to
|
||||
the options in \fIeflags\fP. These can be:
|
||||
against a given \fIstring\fP, which is by default terminated by a zero byte
|
||||
(but see REG_STARTEND below), subject to the options in \fIeflags\fP. These can
|
||||
be:
|
||||
.sp
|
||||
REG_NOTBOL
|
||||
.sp
|
||||
The PCRE_NOTBOL option is set when calling the underlying PCRE matching
|
||||
function.
|
||||
.sp
|
||||
REG_NOTEMPTY
|
||||
.sp
|
||||
The PCRE_NOTEMPTY option is set when calling the underlying PCRE matching
|
||||
function. Note that REG_NOTEMPTY is not part of the POSIX standard. However,
|
||||
setting this option can give more POSIX-like behaviour in some situations.
|
||||
.sp
|
||||
REG_NOTEOL
|
||||
.sp
|
||||
The PCRE_NOTEOL option is set when calling the underlying PCRE matching
|
||||
function.
|
||||
.sp
|
||||
REG_STARTEND
|
||||
.sp
|
||||
The string is considered to start at \fIstring\fP + \fIpmatch[0].rm_so\fP and
|
||||
to have a terminating NUL located at \fIstring\fP + \fIpmatch[0].rm_eo\fP
|
||||
(there need not actually be a NUL at that location), regardless of the value of
|
||||
\fInmatch\fP. This is a BSD extension, compatible with but not specified by
|
||||
IEEE Standard 1003.2 (POSIX.2), and should be used with caution in software
|
||||
intended to be portable to other systems. Note that a non-zero \fIrm_so\fP does
|
||||
not imply REG_NOTBOL; REG_STARTEND affects only the location of the string, not
|
||||
how it is matched.
|
||||
.P
|
||||
If the pattern was compiled with the REG_NOSUB flag, no data about any matched
|
||||
strings is returned. The \fInmatch\fP and \fIpmatch\fP arguments of
|
||||
@@ -214,13 +228,17 @@ memory, after which \fIpreg\fP may no longer be used as a compiled expression.
|
||||
.SH AUTHOR
|
||||
.rs
|
||||
.sp
|
||||
.nf
|
||||
Philip Hazel
|
||||
.br
|
||||
University Computing Service,
|
||||
.br
|
||||
Cambridge CB2 3QG, England.
|
||||
.P
|
||||
.in 0
|
||||
Last updated: 16 January 2006
|
||||
.br
|
||||
Copyright (c) 1997-2006 University of Cambridge.
|
||||
University Computing Service
|
||||
Cambridge CB2 3QH, England.
|
||||
.fi
|
||||
.
|
||||
.
|
||||
.SH REVISION
|
||||
.rs
|
||||
.sp
|
||||
.nf
|
||||
Last updated: 11 March 2009
|
||||
Copyright (c) 1997-2009 University of Cambridge.
|
||||
.fi
|
||||
|
||||
@@ -17,7 +17,9 @@ tables, it is a little bit more complicated.
|
||||
If you save compiled patterns to a file, you can copy them to a different host
|
||||
and run them there. This works even if the new host has the opposite endianness
|
||||
to the one on which the patterns were compiled. There may be a small
|
||||
performance penalty, but it should be insignificant.
|
||||
performance penalty, but it should be insignificant. However, compiling regular
|
||||
expressions with one version of PCRE for use with a different version is not
|
||||
guaranteed to work and may cause crashes.
|
||||
.
|
||||
.
|
||||
.SH "SAVING A COMPILED PATTERN"
|
||||
@@ -115,17 +117,26 @@ usual way.
|
||||
.SH "COMPATIBILITY WITH DIFFERENT PCRE RELEASES"
|
||||
.rs
|
||||
.sp
|
||||
The layout of the control block that is at the start of the data that makes up
|
||||
a compiled pattern was changed for release 5.0. If you have any saved patterns
|
||||
that were compiled with previous releases (not a facility that was previously
|
||||
advertised), you will have to recompile them for release 5.0. However, from now
|
||||
on, it should be possible to make changes in a compatible manner.
|
||||
.P
|
||||
Notwithstanding the above, if you have any saved patterns in UTF-8 mode that
|
||||
use \ep or \eP that were compiled with any release up to and including 6.4, you
|
||||
will have to recompile them for release 6.5 and above.
|
||||
.P
|
||||
.in 0
|
||||
Last updated: 01 February 2006
|
||||
.br
|
||||
Copyright (c) 1997-2006 University of Cambridge.
|
||||
In general, it is safest to recompile all saved patterns when you update to a
|
||||
new PCRE release, though not all updates actually require this. Recompiling is
|
||||
definitely needed for release 7.2.
|
||||
.
|
||||
.
|
||||
.
|
||||
.SH AUTHOR
|
||||
.rs
|
||||
.sp
|
||||
.nf
|
||||
Philip Hazel
|
||||
University Computing Service
|
||||
Cambridge CB2 3QH, England.
|
||||
.fi
|
||||
.
|
||||
.
|
||||
.SH REVISION
|
||||
.rs
|
||||
.sp
|
||||
.nf
|
||||
Last updated: 13 June 2007
|
||||
Copyright (c) 1997-2007 University of Cambridge.
|
||||
.fi
|
||||
|
||||
@@ -59,8 +59,22 @@ need to add
|
||||
-R/usr/local/lib
|
||||
.sp
|
||||
(for example) to the compile command to get round this problem.
|
||||
.P
|
||||
.in 0
|
||||
Last updated: 09 September 2004
|
||||
.br
|
||||
Copyright (c) 1997-2004 University of Cambridge.
|
||||
.
|
||||
.
|
||||
.SH AUTHOR
|
||||
.rs
|
||||
.sp
|
||||
.nf
|
||||
Philip Hazel
|
||||
University Computing Service
|
||||
Cambridge CB2 3QH, England.
|
||||
.fi
|
||||
.
|
||||
.
|
||||
.SH REVISION
|
||||
.rs
|
||||
.sp
|
||||
.nf
|
||||
Last updated: 23 January 2008
|
||||
Copyright (c) 1997-2008 University of Cambridge.
|
||||
.fi
|
||||
|
||||
+72
-27
@@ -52,7 +52,7 @@ frame for each matched character. For a long string, a lot of stack is
|
||||
required. Consider now this rewritten pattern, which matches exactly the same
|
||||
strings:
|
||||
.sp
|
||||
([^<]++|<(?!inet))
|
||||
([^<]++|<(?!inet))+
|
||||
.sp
|
||||
This uses very much less stack, because runs of characters that do not contain
|
||||
"<" are "swallowed" in one item inside the parentheses. Recursion happens only
|
||||
@@ -61,6 +61,13 @@ assume this is relatively rare). A possessive quantifier is used to stop any
|
||||
backtracking into the runs of non-"<" characters, but that is not related to
|
||||
stack usage.
|
||||
.P
|
||||
This example shows that one way of avoiding stack problems when matching long
|
||||
subject strings is to write repeated parenthesized subpatterns to match more
|
||||
than one character whenever possible.
|
||||
.
|
||||
.SS "Compiling PCRE to use heap instead of stack"
|
||||
.rs
|
||||
.sp
|
||||
In environments where stack memory is constrained, you might want to compile
|
||||
PCRE to use heap memory instead of stack for remembering back-up points. This
|
||||
makes it run a lot more slowly, however. Details of how to do this are given in
|
||||
@@ -68,27 +75,17 @@ the
|
||||
.\" HREF
|
||||
\fBpcrebuild\fP
|
||||
.\"
|
||||
documentation.
|
||||
.P
|
||||
In Unix-like environments, there is not often a problem with the stack, though
|
||||
the default limit on stack size varies from system to system. Values from 8Mb
|
||||
to 64Mb are common. You can find your default limit by running the command:
|
||||
documentation. When built in this way, instead of using the stack, PCRE obtains
|
||||
and frees memory by calling the functions that are pointed to by the
|
||||
\fBpcre_stack_malloc\fP and \fBpcre_stack_free\fP variables. By default, these
|
||||
point to \fBmalloc()\fP and \fBfree()\fP, but you can replace the pointers to
|
||||
cause PCRE to use your own functions. Since the block sizes are always the
|
||||
same, and are always freed in reverse order, it may be possible to implement
|
||||
customized memory handlers that are more efficient than the standard functions.
|
||||
.
|
||||
.SS "Limiting PCRE's stack usage"
|
||||
.rs
|
||||
.sp
|
||||
ulimit -s
|
||||
.sp
|
||||
The effect of running out of stack is often SIGSEGV, though sometimes an error
|
||||
message is given. You can normally increase the limit on stack size by code
|
||||
such as this:
|
||||
.sp
|
||||
struct rlimit rlim;
|
||||
getrlimit(RLIMIT_STACK, &rlim);
|
||||
rlim.rlim_cur = 100*1024*1024;
|
||||
setrlimit(RLIMIT_STACK, &rlim);
|
||||
.sp
|
||||
This reads the current limits (soft and hard) using \fBgetrlimit()\fP, then
|
||||
attempts to increase the soft limit to 100Mb using \fBsetrlimit()\fP. You must
|
||||
do this before calling \fBpcre_exec()\fP.
|
||||
.P
|
||||
PCRE has an internal counter that can be used to limit the depth of recursion,
|
||||
and thus cause \fBpcre_exec()\fP to give an error code before it runs out of
|
||||
stack. By default, the limit is very large, and unlikely ever to operate. It
|
||||
@@ -107,9 +104,57 @@ As a very rough rule of thumb, you should reckon on about 500 bytes per
|
||||
recursion. Thus, if you want to limit your stack usage to 8Mb, you
|
||||
should set the limit at 16000 recursions. A 64Mb stack, on the other hand, can
|
||||
support around 128000 recursions. The \fBpcretest\fP test program has a command
|
||||
line option (\fB-S\fP) that can be used to increase its stack.
|
||||
.P
|
||||
.in 0
|
||||
Last updated: 29 June 2006
|
||||
.br
|
||||
Copyright (c) 1997-2006 University of Cambridge.
|
||||
line option (\fB-S\fP) that can be used to increase the size of its stack.
|
||||
.
|
||||
.SS "Changing stack size in Unix-like systems"
|
||||
.rs
|
||||
.sp
|
||||
In Unix-like environments, there is not often a problem with the stack unless
|
||||
very long strings are involved, though the default limit on stack size varies
|
||||
from system to system. Values from 8Mb to 64Mb are common. You can find your
|
||||
default limit by running the command:
|
||||
.sp
|
||||
ulimit -s
|
||||
.sp
|
||||
Unfortunately, the effect of running out of stack is often SIGSEGV, though
|
||||
sometimes a more explicit error message is given. You can normally increase the
|
||||
limit on stack size by code such as this:
|
||||
.sp
|
||||
struct rlimit rlim;
|
||||
getrlimit(RLIMIT_STACK, &rlim);
|
||||
rlim.rlim_cur = 100*1024*1024;
|
||||
setrlimit(RLIMIT_STACK, &rlim);
|
||||
.sp
|
||||
This reads the current limits (soft and hard) using \fBgetrlimit()\fP, then
|
||||
attempts to increase the soft limit to 100Mb using \fBsetrlimit()\fP. You must
|
||||
do this before calling \fBpcre_exec()\fP.
|
||||
.
|
||||
.SS "Changing stack size in Mac OS X"
|
||||
.rs
|
||||
.sp
|
||||
Using \fBsetrlimit()\fP, as described above, should also work on Mac OS X. It
|
||||
is also possible to set a stack size when linking a program. There is a
|
||||
discussion about stack sizes in Mac OS X at this web site:
|
||||
.\" HTML <a href="http://developer.apple.com/qa/qa2005/qa1419.html">
|
||||
.\" </a>
|
||||
http://developer.apple.com/qa/qa2005/qa1419.html.
|
||||
.\"
|
||||
.
|
||||
.
|
||||
.SH AUTHOR
|
||||
.rs
|
||||
.sp
|
||||
.nf
|
||||
Philip Hazel
|
||||
University Computing Service
|
||||
Cambridge CB2 3QH, England.
|
||||
.fi
|
||||
.
|
||||
.
|
||||
.SH REVISION
|
||||
.rs
|
||||
.sp
|
||||
.nf
|
||||
Last updated: 09 July 2008
|
||||
Copyright (c) 1997-2008 University of Cambridge.
|
||||
.fi
|
||||
|
||||
@@ -0,0 +1,449 @@
|
||||
.TH PCRESYNTAX 3
|
||||
.SH NAME
|
||||
PCRE - Perl-compatible regular expressions
|
||||
.SH "PCRE REGULAR EXPRESSION SYNTAX SUMMARY"
|
||||
.rs
|
||||
.sp
|
||||
The full syntax and semantics of the regular expressions that are supported by
|
||||
PCRE are described in the
|
||||
.\" HREF
|
||||
\fBpcrepattern\fP
|
||||
.\"
|
||||
documentation. This document contains just a quick-reference summary of the
|
||||
syntax.
|
||||
.
|
||||
.
|
||||
.SH "QUOTING"
|
||||
.rs
|
||||
.sp
|
||||
\ex where x is non-alphanumeric is a literal x
|
||||
\eQ...\eE treat enclosed characters as literal
|
||||
.
|
||||
.
|
||||
.SH "CHARACTERS"
|
||||
.rs
|
||||
.sp
|
||||
\ea alarm, that is, the BEL character (hex 07)
|
||||
\ecx "control-x", where x is any character
|
||||
\ee escape (hex 1B)
|
||||
\ef formfeed (hex 0C)
|
||||
\en newline (hex 0A)
|
||||
\er carriage return (hex 0D)
|
||||
\et tab (hex 09)
|
||||
\eddd character with octal code ddd, or backreference
|
||||
\exhh character with hex code hh
|
||||
\ex{hhh..} character with hex code hhh..
|
||||
.
|
||||
.
|
||||
.SH "CHARACTER TYPES"
|
||||
.rs
|
||||
.sp
|
||||
. any character except newline;
|
||||
in dotall mode, any character whatsoever
|
||||
\eC one byte, even in UTF-8 mode (best avoided)
|
||||
\ed a decimal digit
|
||||
\eD a character that is not a decimal digit
|
||||
\eh a horizontal whitespace character
|
||||
\eH a character that is not a horizontal whitespace character
|
||||
\ep{\fIxx\fP} a character with the \fIxx\fP property
|
||||
\eP{\fIxx\fP} a character without the \fIxx\fP property
|
||||
\eR a newline sequence
|
||||
\es a whitespace character
|
||||
\eS a character that is not a whitespace character
|
||||
\ev a vertical whitespace character
|
||||
\eV a character that is not a vertical whitespace character
|
||||
\ew a "word" character
|
||||
\eW a "non-word" character
|
||||
\eX an extended Unicode sequence
|
||||
.sp
|
||||
In PCRE, \ed, \eD, \es, \eS, \ew, and \eW recognize only ASCII characters.
|
||||
.
|
||||
.
|
||||
.SH "GENERAL CATEGORY PROPERTY CODES FOR \ep AND \eP"
|
||||
.rs
|
||||
.sp
|
||||
C Other
|
||||
Cc Control
|
||||
Cf Format
|
||||
Cn Unassigned
|
||||
Co Private use
|
||||
Cs Surrogate
|
||||
.sp
|
||||
L Letter
|
||||
Ll Lower case letter
|
||||
Lm Modifier letter
|
||||
Lo Other letter
|
||||
Lt Title case letter
|
||||
Lu Upper case letter
|
||||
L& Ll, Lu, or Lt
|
||||
.sp
|
||||
M Mark
|
||||
Mc Spacing mark
|
||||
Me Enclosing mark
|
||||
Mn Non-spacing mark
|
||||
.sp
|
||||
N Number
|
||||
Nd Decimal number
|
||||
Nl Letter number
|
||||
No Other number
|
||||
.sp
|
||||
P Punctuation
|
||||
Pc Connector punctuation
|
||||
Pd Dash punctuation
|
||||
Pe Close punctuation
|
||||
Pf Final punctuation
|
||||
Pi Initial punctuation
|
||||
Po Other punctuation
|
||||
Ps Open punctuation
|
||||
.sp
|
||||
S Symbol
|
||||
Sc Currency symbol
|
||||
Sk Modifier symbol
|
||||
Sm Mathematical symbol
|
||||
So Other symbol
|
||||
.sp
|
||||
Z Separator
|
||||
Zl Line separator
|
||||
Zp Paragraph separator
|
||||
Zs Space separator
|
||||
.
|
||||
.
|
||||
.SH "SCRIPT NAMES FOR \ep AND \eP"
|
||||
.rs
|
||||
.sp
|
||||
Arabic,
|
||||
Armenian,
|
||||
Balinese,
|
||||
Bengali,
|
||||
Bopomofo,
|
||||
Braille,
|
||||
Buginese,
|
||||
Buhid,
|
||||
Canadian_Aboriginal,
|
||||
Carian,
|
||||
Cham,
|
||||
Cherokee,
|
||||
Common,
|
||||
Coptic,
|
||||
Cuneiform,
|
||||
Cypriot,
|
||||
Cyrillic,
|
||||
Deseret,
|
||||
Devanagari,
|
||||
Ethiopic,
|
||||
Georgian,
|
||||
Glagolitic,
|
||||
Gothic,
|
||||
Greek,
|
||||
Gujarati,
|
||||
Gurmukhi,
|
||||
Han,
|
||||
Hangul,
|
||||
Hanunoo,
|
||||
Hebrew,
|
||||
Hiragana,
|
||||
Inherited,
|
||||
Kannada,
|
||||
Katakana,
|
||||
Kayah_Li,
|
||||
Kharoshthi,
|
||||
Khmer,
|
||||
Lao,
|
||||
Latin,
|
||||
Lepcha,
|
||||
Limbu,
|
||||
Linear_B,
|
||||
Lycian,
|
||||
Lydian,
|
||||
Malayalam,
|
||||
Mongolian,
|
||||
Myanmar,
|
||||
New_Tai_Lue,
|
||||
Nko,
|
||||
Ogham,
|
||||
Old_Italic,
|
||||
Old_Persian,
|
||||
Ol_Chiki,
|
||||
Oriya,
|
||||
Osmanya,
|
||||
Phags_Pa,
|
||||
Phoenician,
|
||||
Rejang,
|
||||
Runic,
|
||||
Saurashtra,
|
||||
Shavian,
|
||||
Sinhala,
|
||||
Sundanese,
|
||||
Syloti_Nagri,
|
||||
Syriac,
|
||||
Tagalog,
|
||||
Tagbanwa,
|
||||
Tai_Le,
|
||||
Tamil,
|
||||
Telugu,
|
||||
Thaana,
|
||||
Thai,
|
||||
Tibetan,
|
||||
Tifinagh,
|
||||
Ugaritic,
|
||||
Vai,
|
||||
Yi.
|
||||
.
|
||||
.
|
||||
.SH "CHARACTER CLASSES"
|
||||
.rs
|
||||
.sp
|
||||
[...] positive character class
|
||||
[^...] negative character class
|
||||
[x-y] range (can be used for hex characters)
|
||||
[[:xxx:]] positive POSIX named set
|
||||
[[:^xxx:]] negative POSIX named set
|
||||
.sp
|
||||
alnum alphanumeric
|
||||
alpha alphabetic
|
||||
ascii 0-127
|
||||
blank space or tab
|
||||
cntrl control character
|
||||
digit decimal digit
|
||||
graph printing, excluding space
|
||||
lower lower case letter
|
||||
print printing, including space
|
||||
punct printing, excluding alphanumeric
|
||||
space whitespace
|
||||
upper upper case letter
|
||||
word same as \ew
|
||||
xdigit hexadecimal digit
|
||||
.sp
|
||||
In PCRE, POSIX character set names recognize only ASCII characters. You can use
|
||||
\eQ...\eE inside a character class.
|
||||
.
|
||||
.
|
||||
.SH "QUANTIFIERS"
|
||||
.rs
|
||||
.sp
|
||||
? 0 or 1, greedy
|
||||
?+ 0 or 1, possessive
|
||||
?? 0 or 1, lazy
|
||||
* 0 or more, greedy
|
||||
*+ 0 or more, possessive
|
||||
*? 0 or more, lazy
|
||||
+ 1 or more, greedy
|
||||
++ 1 or more, possessive
|
||||
+? 1 or more, lazy
|
||||
{n} exactly n
|
||||
{n,m} at least n, no more than m, greedy
|
||||
{n,m}+ at least n, no more than m, possessive
|
||||
{n,m}? at least n, no more than m, lazy
|
||||
{n,} n or more, greedy
|
||||
{n,}+ n or more, possessive
|
||||
{n,}? n or more, lazy
|
||||
.
|
||||
.
|
||||
.SH "ANCHORS AND SIMPLE ASSERTIONS"
|
||||
.rs
|
||||
.sp
|
||||
\eb word boundary (only ASCII letters recognized)
|
||||
\eB not a word boundary
|
||||
^ start of subject
|
||||
also after internal newline in multiline mode
|
||||
\eA start of subject
|
||||
$ end of subject
|
||||
also before newline at end of subject
|
||||
also before internal newline in multiline mode
|
||||
\eZ end of subject
|
||||
also before newline at end of subject
|
||||
\ez end of subject
|
||||
\eG first matching position in subject
|
||||
.
|
||||
.
|
||||
.SH "MATCH POINT RESET"
|
||||
.rs
|
||||
.sp
|
||||
\eK reset start of match
|
||||
.
|
||||
.
|
||||
.SH "ALTERNATION"
|
||||
.rs
|
||||
.sp
|
||||
expr|expr|expr...
|
||||
.
|
||||
.
|
||||
.SH "CAPTURING"
|
||||
.rs
|
||||
.sp
|
||||
(...) capturing group
|
||||
(?<name>...) named capturing group (Perl)
|
||||
(?'name'...) named capturing group (Perl)
|
||||
(?P<name>...) named capturing group (Python)
|
||||
(?:...) non-capturing group
|
||||
(?|...) non-capturing group; reset group numbers for
|
||||
capturing groups in each alternative
|
||||
.
|
||||
.
|
||||
.SH "ATOMIC GROUPS"
|
||||
.rs
|
||||
.sp
|
||||
(?>...) atomic, non-capturing group
|
||||
.
|
||||
.
|
||||
.
|
||||
.
|
||||
.SH "COMMENT"
|
||||
.rs
|
||||
.sp
|
||||
(?#....) comment (not nestable)
|
||||
.
|
||||
.
|
||||
.SH "OPTION SETTING"
|
||||
.rs
|
||||
.sp
|
||||
(?i) caseless
|
||||
(?J) allow duplicate names
|
||||
(?m) multiline
|
||||
(?s) single line (dotall)
|
||||
(?U) default ungreedy (lazy)
|
||||
(?x) extended (ignore white space)
|
||||
(?-...) unset option(s)
|
||||
.sp
|
||||
The following is recognized only at the start of a pattern or after one of the
|
||||
newline-setting options with similar syntax:
|
||||
.sp
|
||||
(*UTF8) set UTF-8 mode
|
||||
.
|
||||
.
|
||||
.SH "LOOKAHEAD AND LOOKBEHIND ASSERTIONS"
|
||||
.rs
|
||||
.sp
|
||||
(?=...) positive look ahead
|
||||
(?!...) negative look ahead
|
||||
(?<=...) positive look behind
|
||||
(?<!...) negative look behind
|
||||
.sp
|
||||
Each top-level branch of a look behind must be of a fixed length.
|
||||
.
|
||||
.
|
||||
.SH "BACKREFERENCES"
|
||||
.rs
|
||||
.sp
|
||||
\en reference by number (can be ambiguous)
|
||||
\egn reference by number
|
||||
\eg{n} reference by number
|
||||
\eg{-n} relative reference by number
|
||||
\ek<name> reference by name (Perl)
|
||||
\ek'name' reference by name (Perl)
|
||||
\eg{name} reference by name (Perl)
|
||||
\ek{name} reference by name (.NET)
|
||||
(?P=name) reference by name (Python)
|
||||
.
|
||||
.
|
||||
.SH "SUBROUTINE REFERENCES (POSSIBLY RECURSIVE)"
|
||||
.rs
|
||||
.sp
|
||||
(?R) recurse whole pattern
|
||||
(?n) call subpattern by absolute number
|
||||
(?+n) call subpattern by relative number
|
||||
(?-n) call subpattern by relative number
|
||||
(?&name) call subpattern by name (Perl)
|
||||
(?P>name) call subpattern by name (Python)
|
||||
\eg<name> call subpattern by name (Oniguruma)
|
||||
\eg'name' call subpattern by name (Oniguruma)
|
||||
\eg<n> call subpattern by absolute number (Oniguruma)
|
||||
\eg'n' call subpattern by absolute number (Oniguruma)
|
||||
\eg<+n> call subpattern by relative number (PCRE extension)
|
||||
\eg'+n' call subpattern by relative number (PCRE extension)
|
||||
\eg<-n> call subpattern by relative number (PCRE extension)
|
||||
\eg'-n' call subpattern by relative number (PCRE extension)
|
||||
.
|
||||
.
|
||||
.SH "CONDITIONAL PATTERNS"
|
||||
.rs
|
||||
.sp
|
||||
(?(condition)yes-pattern)
|
||||
(?(condition)yes-pattern|no-pattern)
|
||||
.sp
|
||||
(?(n)... absolute reference condition
|
||||
(?(+n)... relative reference condition
|
||||
(?(-n)... relative reference condition
|
||||
(?(<name>)... named reference condition (Perl)
|
||||
(?('name')... named reference condition (Perl)
|
||||
(?(name)... named reference condition (PCRE)
|
||||
(?(R)... overall recursion condition
|
||||
(?(Rn)... specific group recursion condition
|
||||
(?(R&name)... specific recursion condition
|
||||
(?(DEFINE)... define subpattern for reference
|
||||
(?(assert)... assertion condition
|
||||
.
|
||||
.
|
||||
.SH "BACKTRACKING CONTROL"
|
||||
.rs
|
||||
.sp
|
||||
The following act as soon as they are reached:
|
||||
.sp
|
||||
(*ACCEPT) force successful match
|
||||
(*FAIL) force backtrack; synonym (*F)
|
||||
.sp
|
||||
The following act only when a subsequent match failure causes a backtrack to
|
||||
reach them. They all force a match failure, but they differ in what happens
|
||||
afterwards. Those that advance the start-of-match point do so only if the
|
||||
pattern is not anchored.
|
||||
.sp
|
||||
(*COMMIT) overall failure, no advance of starting point
|
||||
(*PRUNE) advance to next starting character
|
||||
(*SKIP) advance start to current matching position
|
||||
(*THEN) local failure, backtrack to next alternation
|
||||
.
|
||||
.
|
||||
.SH "NEWLINE CONVENTIONS"
|
||||
.rs
|
||||
.sp
|
||||
These are recognized only at the very start of the pattern or after a
|
||||
(*BSR_...) or (*UTF8) option.
|
||||
.sp
|
||||
(*CR) carriage return only
|
||||
(*LF) linefeed only
|
||||
(*CRLF) carriage return followed by linefeed
|
||||
(*ANYCRLF) all three of the above
|
||||
(*ANY) any Unicode newline sequence
|
||||
.
|
||||
.
|
||||
.SH "WHAT \eR MATCHES"
|
||||
.rs
|
||||
.sp
|
||||
These are recognized only at the very start of the pattern or after a
|
||||
(*...) option that sets the newline convention or UTF-8 mode.
|
||||
.sp
|
||||
(*BSR_ANYCRLF) CR, LF, or CRLF
|
||||
(*BSR_UNICODE) any Unicode newline sequence
|
||||
.
|
||||
.
|
||||
.SH "CALLOUTS"
|
||||
.rs
|
||||
.sp
|
||||
(?C) callout
|
||||
(?Cn) callout with data n
|
||||
.
|
||||
.
|
||||
.SH "SEE ALSO"
|
||||
.rs
|
||||
.sp
|
||||
\fBpcrepattern\fP(3), \fBpcreapi\fP(3), \fBpcrecallout\fP(3),
|
||||
\fBpcrematching\fP(3), \fBpcre\fP(3).
|
||||
.
|
||||
.
|
||||
.SH AUTHOR
|
||||
.rs
|
||||
.sp
|
||||
.nf
|
||||
Philip Hazel
|
||||
University Computing Service
|
||||
Cambridge CB2 3QH, England.
|
||||
.fi
|
||||
.
|
||||
.
|
||||
.SH REVISION
|
||||
.rs
|
||||
.sp
|
||||
.nf
|
||||
Last updated: 11 April 2009
|
||||
Copyright (c) 1997-2009 University of Cambridge.
|
||||
.fi
|
||||
+152
-55
@@ -24,23 +24,36 @@ documentation.
|
||||
.SH OPTIONS
|
||||
.rs
|
||||
.TP 10
|
||||
\fB-b\fP
|
||||
Behave as if each regex has the \fB/B\fP (show bytecode) modifier; the internal
|
||||
form is output after compilation.
|
||||
.TP 10
|
||||
\fB-C\fP
|
||||
Output the version number of the PCRE library, and all available information
|
||||
about the optional features that are included, and then exit.
|
||||
.TP 10
|
||||
\fB-d\fP
|
||||
Behave as if each regex has the \fB/D\fP (debug) modifier; the internal
|
||||
form is output after compilation.
|
||||
form and information about the compiled pattern is output after compilation;
|
||||
\fB-d\fP is equivalent to \fB-b -i\fP.
|
||||
.TP 10
|
||||
\fB-dfa\fP
|
||||
Behave as if each data line contains the \eD escape sequence; this causes the
|
||||
alternative matching function, \fBpcre_dfa_exec()\fP, to be used instead of the
|
||||
standard \fBpcre_exec()\fP function (more detail is given below).
|
||||
.TP 10
|
||||
\fB-help\fP
|
||||
Output a brief summary of these options and then exit.
|
||||
.TP 10
|
||||
\fB-i\fP
|
||||
Behave as if each regex has the \fB/I\fP modifier; information about the
|
||||
compiled pattern is given after compilation.
|
||||
.TP 10
|
||||
\fB-M\fP
|
||||
Behave as if each data line contains the \eM escape sequence; this causes
|
||||
PCRE to discover the minimum MATCH_LIMIT and MATCH_LIMIT_RECURSION settings by
|
||||
calling \fBpcre_exec()\fP repeatedly with different limits.
|
||||
.TP 10
|
||||
\fB-m\fP
|
||||
Output the size of each compiled pattern after it has been compiled. This is
|
||||
equivalent to adding \fB/M\fP to each regular expression. For compatibility
|
||||
@@ -48,9 +61,11 @@ with earlier versions of pcretest, \fB-s\fP is a synonym for \fB-m\fP.
|
||||
.TP 10
|
||||
\fB-o\fP \fIosize\fP
|
||||
Set the number of elements in the output vector that is used when calling
|
||||
\fBpcre_exec()\fP to be \fIosize\fP. The default value is 45, which is enough
|
||||
for 14 capturing subexpressions. The vector size can be changed for individual
|
||||
matching calls by including \eO in the data line (see below).
|
||||
\fBpcre_exec()\fP or \fBpcre_dfa_exec()\fP to be \fIosize\fP. The default value
|
||||
is 45, which is enough for 14 capturing subexpressions for \fBpcre_exec()\fP or
|
||||
22 different matches for \fBpcre_dfa_exec()\fP. The vector size can be
|
||||
changed for individual matching calls by including \eO in the data line (see
|
||||
below).
|
||||
.TP 10
|
||||
\fB-p\fP
|
||||
Behave as if each regex has the \fB/P\fP modifier; the POSIX wrapper API is
|
||||
@@ -68,7 +83,14 @@ megabytes.
|
||||
Run each compile, study, and match many times with a timer, and output
|
||||
resulting time per compile or match (in milliseconds). Do not set \fB-m\fP with
|
||||
\fB-t\fP, because you will then get the size output a zillion times, and the
|
||||
timing will be distorted.
|
||||
timing will be distorted. You can control the number of iterations that are
|
||||
used for timing by following \fB-t\fP with a number (as a separate item on the
|
||||
command line). For example, "-t 1000" would iterate 1000 times. The default is
|
||||
to iterate 500000 times.
|
||||
.TP 10
|
||||
\fB-tm\fP
|
||||
This is like \fB-t\fP except that it times only the matching phase, not the
|
||||
compile or study phases.
|
||||
.
|
||||
.
|
||||
.SH DESCRIPTION
|
||||
@@ -80,14 +102,20 @@ that file and writes to stdout. Otherwise, it reads from stdin and writes to
|
||||
stdout, and prompts for each line of input, using "re>" to prompt for regular
|
||||
expressions, and "data>" to prompt for data lines.
|
||||
.P
|
||||
When \fBpcretest\fP is built, a configuration option can specify that it should
|
||||
be linked with the \fBlibreadline\fP library. When this is done, if the input
|
||||
is from a terminal, it is read using the \fBreadline()\fP function. This
|
||||
provides line-editing and history facilities. The output from the \fB-help\fP
|
||||
option states whether or not \fBreadline()\fP will be used.
|
||||
.P
|
||||
The program handles any number of sets of input on a single input file. Each
|
||||
set starts with a regular expression, and continues with any number of data
|
||||
lines to be matched against the pattern.
|
||||
.P
|
||||
Each data line is matched separately and independently. If you want to do
|
||||
multi-line matches, you have to use the \en escape sequence (or \er or \er\en,
|
||||
depending on the newline setting) in a single line of input to encode the
|
||||
newline characters. There is no limit on the length of data lines; the input
|
||||
etc., depending on the newline setting) in a single line of input to encode the
|
||||
newline sequences. There is no limit on the length of data lines; the input
|
||||
buffer is automatically extended if it is too small.
|
||||
.P
|
||||
An empty line signals the end of the data lines, at which point a new regular
|
||||
@@ -140,20 +168,30 @@ effect as they do in Perl. For example:
|
||||
The following table shows additional modifiers for setting PCRE options that do
|
||||
not correspond to anything in Perl:
|
||||
.sp
|
||||
\fB/A\fP PCRE_ANCHORED
|
||||
\fB/C\fP PCRE_AUTO_CALLOUT
|
||||
\fB/E\fP PCRE_DOLLAR_ENDONLY
|
||||
\fB/f\fP PCRE_FIRSTLINE
|
||||
\fB/J\fP PCRE_DUPNAMES
|
||||
\fB/N\fP PCRE_NO_AUTO_CAPTURE
|
||||
\fB/U\fP PCRE_UNGREEDY
|
||||
\fB/X\fP PCRE_EXTRA
|
||||
\fB/<cr>\fP PCRE_NEWLINE_CR
|
||||
\fB/<lf>\fP PCRE_NEWLINE_LF
|
||||
\fB/<crlf>\fP PCRE_NEWLINE_CRLF
|
||||
\fB/A\fP PCRE_ANCHORED
|
||||
\fB/C\fP PCRE_AUTO_CALLOUT
|
||||
\fB/E\fP PCRE_DOLLAR_ENDONLY
|
||||
\fB/f\fP PCRE_FIRSTLINE
|
||||
\fB/J\fP PCRE_DUPNAMES
|
||||
\fB/N\fP PCRE_NO_AUTO_CAPTURE
|
||||
\fB/U\fP PCRE_UNGREEDY
|
||||
\fB/X\fP PCRE_EXTRA
|
||||
\fB/<JS>\fP PCRE_JAVASCRIPT_COMPAT
|
||||
\fB/<cr>\fP PCRE_NEWLINE_CR
|
||||
\fB/<lf>\fP PCRE_NEWLINE_LF
|
||||
\fB/<crlf>\fP PCRE_NEWLINE_CRLF
|
||||
\fB/<anycrlf>\fP PCRE_NEWLINE_ANYCRLF
|
||||
\fB/<any>\fP PCRE_NEWLINE_ANY
|
||||
\fB/<bsr_anycrlf>\fP PCRE_BSR_ANYCRLF
|
||||
\fB/<bsr_unicode>\fP PCRE_BSR_UNICODE
|
||||
.sp
|
||||
Those specifying line endings are literal strings as shown. Details of the
|
||||
meanings of these PCRE options are given in the
|
||||
Those specifying line ending sequences are literal strings as shown, but the
|
||||
letters can be in either case. This example sets multiline matching with CRLF
|
||||
as the line ending sequence:
|
||||
.sp
|
||||
/^abc/m<crlf>
|
||||
.sp
|
||||
Details of the meanings of these PCRE options are given in the
|
||||
.\" HREF
|
||||
\fBpcreapi\fP
|
||||
.\"
|
||||
@@ -191,6 +229,13 @@ matched the entire pattern, pcretest should in addition output the remainder of
|
||||
the subject string. This is useful for tests where the subject contains
|
||||
multiple copies of the same substring.
|
||||
.P
|
||||
The \fB/B\fP modifier is a debugging feature. It requests that \fBpcretest\fP
|
||||
output a representation of the compiled byte code after compilation. Normally
|
||||
this information contains length and offset values; however, if \fB/Z\fP is
|
||||
also present, this data is replaced by spaces. This is a special feature for
|
||||
use in the automatic test scripts; it ensures that the same output is generated
|
||||
for different internal link sizes.
|
||||
.P
|
||||
The \fB/L\fP modifier must be followed directly by the name of a locale, for
|
||||
example,
|
||||
.sp
|
||||
@@ -207,10 +252,8 @@ compiled pattern (whether it is anchored, has a fixed first character, and
|
||||
so on). It does this by calling \fBpcre_fullinfo()\fP after compiling a
|
||||
pattern. If the pattern is studied, the results of that are also output.
|
||||
.P
|
||||
The \fB/D\fP modifier is a PCRE debugging feature, which also assumes \fB/I\fP.
|
||||
It causes the internal form of compiled regular expressions to be output after
|
||||
compilation. If the pattern was studied, the information returned is also
|
||||
output.
|
||||
The \fB/D\fP modifier is a PCRE debugging feature, and is equivalent to
|
||||
\fB/BI\fP, that is, both the \fB/B\fP and the \fB/I\fP modifiers.
|
||||
.P
|
||||
The \fB/F\fP modifier causes \fBpcretest\fP to flip the byte order of the
|
||||
fields in the compiled pattern that contain 2-byte and 4-byte numbers. This
|
||||
@@ -254,17 +297,17 @@ complicated features of PCRE. If you are just testing "ordinary" regular
|
||||
expressions, you probably don't need any of these. The following escapes are
|
||||
recognized:
|
||||
.sp
|
||||
\ea alarm (= BEL)
|
||||
\eb backspace
|
||||
\ee escape
|
||||
\ef formfeed
|
||||
\en newline
|
||||
\ea alarm (BEL, \ex07)
|
||||
\eb backspace (\ex08)
|
||||
\ee escape (\ex1b)
|
||||
\ef formfeed (\ex0c)
|
||||
\en newline (\ex0a)
|
||||
.\" JOIN
|
||||
\eqdd set the PCRE_MATCH_LIMIT limit to dd
|
||||
(any number of digits)
|
||||
\er carriage return
|
||||
\et tab
|
||||
\ev vertical tab
|
||||
\er carriage return (\ex0d)
|
||||
\et tab (\ex09)
|
||||
\ev vertical tab (\ex0b)
|
||||
\ennn octal character (up to 3 octal digits)
|
||||
\exhh hexadecimal character (up to 2 hex digits)
|
||||
.\" JOIN
|
||||
@@ -344,11 +387,20 @@ recognized:
|
||||
.\" JOIN
|
||||
\e<crlf> pass the PCRE_NEWLINE_CRLF option to \fBpcre_exec()\fP
|
||||
or \fBpcre_dfa_exec()\fP
|
||||
.\" JOIN
|
||||
\e<anycrlf> pass the PCRE_NEWLINE_ANYCRLF option to \fBpcre_exec()\fP
|
||||
or \fBpcre_dfa_exec()\fP
|
||||
.\" JOIN
|
||||
\e<any> pass the PCRE_NEWLINE_ANY option to \fBpcre_exec()\fP
|
||||
or \fBpcre_dfa_exec()\fP
|
||||
.sp
|
||||
The escapes that specify line endings are literal strings, exactly as shown.
|
||||
A backslash followed by anything else just escapes the anything else. If the
|
||||
very last character is a backslash, it is ignored. This gives a way of passing
|
||||
an empty line as data, since a real empty line terminates the data input.
|
||||
The escapes that specify line ending sequences are literal strings, exactly as
|
||||
shown. No more than one newline setting should be present in any data line.
|
||||
.P
|
||||
A backslash followed by anything else just escapes the anything else. If
|
||||
the very last character is a backslash, it is ignored. This gives a way of
|
||||
passing an empty line as data, since a real empty line terminates the data
|
||||
input.
|
||||
.P
|
||||
If \eM is present, \fBpcretest\fP calls \fBpcre_exec()\fP several times, with
|
||||
different values in the \fImatch_limit\fP and \fImatch_limit_recursion\fP
|
||||
@@ -374,7 +426,10 @@ and \eZ, causing REG_NOTBOL and REG_NOTEOL, respectively, to be passed to
|
||||
The use of \ex{hh...} to represent UTF-8 characters is not dependent on the use
|
||||
of the \fB/8\fP modifier on the pattern. It is recognized always. There may be
|
||||
any number of hexadecimal digits inside the braces. The result is from one to
|
||||
six bytes, encoded according to the UTF-8 rules.
|
||||
six bytes, encoded according to the original UTF-8 rules of RFC 2279. This
|
||||
allows for values in the range 0 to 0x7FFFFFFF. Note that not all of those are
|
||||
valid Unicode code points, or indeed valid UTF-8 characters according to the
|
||||
later rules in RFC 3629.
|
||||
.
|
||||
.
|
||||
.SH "THE ALTERNATIVE MATCHING FUNCTION"
|
||||
@@ -411,7 +466,7 @@ respectively, and otherwise the PCRE negative error number. Here is an example
|
||||
of an interactive \fBpcretest\fP run.
|
||||
.sp
|
||||
$ pcretest
|
||||
PCRE version 5.00 07-Sep-2004
|
||||
PCRE version 7.0 30-Nov-2006
|
||||
.sp
|
||||
re> /^abc(\ed+)/
|
||||
data> abc123
|
||||
@@ -420,11 +475,26 @@ of an interactive \fBpcretest\fP run.
|
||||
data> xyz
|
||||
No match
|
||||
.sp
|
||||
Note that unset capturing substrings that are not followed by one that is set
|
||||
are not returned by \fBpcre_exec()\fP, and are not shown by \fBpcretest\fP. In
|
||||
the following example, there are two capturing substrings, but when the first
|
||||
data line is matched, the second, unset substring is not shown. An "internal"
|
||||
unset substring is shown as "<unset>", as for the second data line.
|
||||
.sp
|
||||
re> /(a)|(b)/
|
||||
data> a
|
||||
0: a
|
||||
1: a
|
||||
data> b
|
||||
0: b
|
||||
1: <unset>
|
||||
2: b
|
||||
.sp
|
||||
If the strings contain any non-printing characters, they are output as \e0x
|
||||
escapes, or as \ex{...} escapes if the \fB/8\fP modifier was present on the
|
||||
pattern. If the pattern has the \fB/+\fP modifier, the output for substring 0
|
||||
is followed by the the rest of the subject string, identified by "0+" like
|
||||
this:
|
||||
pattern. See below for the definition of non-printing characters. If the
|
||||
pattern has the \fB/+\fP modifier, the output for substring 0 is followed by
|
||||
the rest of the subject string, identified by "0+" like this:
|
||||
.sp
|
||||
re> /cat/+
|
||||
data> cataract
|
||||
@@ -452,10 +522,11 @@ instead of a colon. This is in addition to the normal full list. The string
|
||||
length (that is, the return from the extraction function) is given in
|
||||
parentheses after each string for \fB\eC\fP and \fB\eG\fP.
|
||||
.P
|
||||
Note that while patterns can be continued over several lines (a plain ">"
|
||||
Note that whereas patterns can be continued over several lines (a plain ">"
|
||||
prompt is used for continuations), data lines may not. However newlines can be
|
||||
included in data by means of the \en escape (or \er or \er\en for those newline
|
||||
settings).
|
||||
included in data by means of the \en escape (or \er, \er\en, etc., depending on
|
||||
the newline sequence setting).
|
||||
.
|
||||
.
|
||||
.
|
||||
.SH "OUTPUT FROM THE ALTERNATIVE MATCHING FUNCTION"
|
||||
@@ -475,7 +546,7 @@ the subject where there is at least one match. For example:
|
||||
(Using the normal matching function on this data finds only "tang".) The
|
||||
longest matching string is always given first (and numbered zero).
|
||||
.P
|
||||
If \fB/g\P is present on the pattern, the search for further matches resumes
|
||||
If \fB/g\fP is present on the pattern, the search for further matches resumes
|
||||
at the end of the longest match. For example:
|
||||
.sp
|
||||
re> /(tang|tangerine|tan)/g
|
||||
@@ -499,7 +570,7 @@ indicating that the subject partially matched the pattern, you can restart the
|
||||
match with additional subject data by means of the \eR escape sequence. For
|
||||
example:
|
||||
.sp
|
||||
re> /^\d?\d(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\d\d$/
|
||||
re> /^\ed?\ed(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\ed\ed$/
|
||||
data> 23ja\eP\eD
|
||||
Partial match: 23ja
|
||||
data> n05\eR\eD
|
||||
@@ -556,6 +627,21 @@ the
|
||||
documentation.
|
||||
.
|
||||
.
|
||||
.
|
||||
.SH "NON-PRINTING CHARACTERS"
|
||||
.rs
|
||||
.sp
|
||||
When \fBpcretest\fP is outputting text in the compiled version of a pattern,
|
||||
bytes other than 32-126 are always treated as non-printing characters and are
|
||||
therefore shown as hex escapes.
|
||||
.P
|
||||
When \fBpcretest\fP is outputting text that is a matched part of a subject
|
||||
string, it behaves in the same way, unless a different locale has been set for
|
||||
the pattern (using the \fB/L\fP modifier). In this case, the \fBisprint()\fP
|
||||
function is used to distinguish printing and non-printing characters.
|
||||
.
|
||||
.
|
||||
.
|
||||
.SH "SAVING AND RELOADING COMPILED PATTERNS"
|
||||
.rs
|
||||
.sp
|
||||
@@ -616,16 +702,27 @@ Finally, if you attempt to load a file that is not in the correct format, the
|
||||
result is undefined.
|
||||
.
|
||||
.
|
||||
.SH "SEE ALSO"
|
||||
.rs
|
||||
.sp
|
||||
\fBpcre\fP(3), \fBpcreapi\fP(3), \fBpcrecallout\fP(3), \fBpcrematching\fP(3),
|
||||
\fBpcrepartial\fP(3), \fBpcrepattern\fP(3), \fBpcreprecompile\fP(3).
|
||||
.
|
||||
.
|
||||
.SH AUTHOR
|
||||
.rs
|
||||
.sp
|
||||
.nf
|
||||
Philip Hazel
|
||||
.br
|
||||
University Computing Service,
|
||||
.br
|
||||
Cambridge CB2 3QG, England.
|
||||
.P
|
||||
.in 0
|
||||
Last updated: 29 June 2006
|
||||
.br
|
||||
Copyright (c) 1997-2006 University of Cambridge.
|
||||
University Computing Service
|
||||
Cambridge CB2 3QH, England.
|
||||
.fi
|
||||
.
|
||||
.
|
||||
.SH REVISION
|
||||
.rs
|
||||
.sp
|
||||
.nf
|
||||
Last updated: 10 March 2009
|
||||
Copyright (c) 1997-2009 University of Cambridge.
|
||||
.fi
|
||||
|
||||
+242
-152
@@ -19,67 +19,93 @@ SYNOPSIS
|
||||
|
||||
OPTIONS
|
||||
|
||||
-b Behave as if each regex has the /B (show bytecode) modifier;
|
||||
the internal form is output after compilation.
|
||||
|
||||
-C Output the version number of the PCRE library, and all avail-
|
||||
able information about the optional features that are
|
||||
able information about the optional features that are
|
||||
included, and then exit.
|
||||
|
||||
-d Behave as if each regex has the /D (debug) modifier; the
|
||||
internal form is output after compilation.
|
||||
-d Behave as if each regex has the /D (debug) modifier; the
|
||||
internal form and information about the compiled pattern is
|
||||
output after compilation; -d is equivalent to -b -i.
|
||||
|
||||
-dfa Behave as if each data line contains the \D escape sequence;
|
||||
this causes the alternative matching function,
|
||||
pcre_dfa_exec(), to be used instead of the standard
|
||||
pcre_exec() function (more detail is given below).
|
||||
|
||||
-help Output a brief summary of these options and then exit.
|
||||
|
||||
-i Behave as if each regex has the /I modifier; information
|
||||
about the compiled pattern is given after compilation.
|
||||
|
||||
-m Output the size of each compiled pattern after it has been
|
||||
compiled. This is equivalent to adding /M to each regular
|
||||
expression. For compatibility with earlier versions of
|
||||
-M Behave as if each data line contains the \M escape sequence;
|
||||
this causes PCRE to discover the minimum MATCH_LIMIT and
|
||||
MATCH_LIMIT_RECURSION settings by calling pcre_exec() repeat-
|
||||
edly with different limits.
|
||||
|
||||
-m Output the size of each compiled pattern after it has been
|
||||
compiled. This is equivalent to adding /M to each regular
|
||||
expression. For compatibility with earlier versions of
|
||||
pcretest, -s is a synonym for -m.
|
||||
|
||||
-o osize Set the number of elements in the output vector that is used
|
||||
when calling pcre_exec() to be osize. The default value is
|
||||
45, which is enough for 14 capturing subexpressions. The vec-
|
||||
tor size can be changed for individual matching calls by
|
||||
including \O in the data line (see below).
|
||||
-o osize Set the number of elements in the output vector that is used
|
||||
when calling pcre_exec() or pcre_dfa_exec() to be osize. The
|
||||
default value is 45, which is enough for 14 capturing subex-
|
||||
pressions for pcre_exec() or 22 different matches for
|
||||
pcre_dfa_exec(). The vector size can be changed for individ-
|
||||
ual matching calls by including \O in the data line (see
|
||||
below).
|
||||
|
||||
-p Behave as if each regex has the /P modifier; the POSIX wrap-
|
||||
per API is used to call PCRE. None of the other options has
|
||||
-p Behave as if each regex has the /P modifier; the POSIX wrap-
|
||||
per API is used to call PCRE. None of the other options has
|
||||
any effect when -p is set.
|
||||
|
||||
-q Do not output the version number of pcretest at the start of
|
||||
-q Do not output the version number of pcretest at the start of
|
||||
execution.
|
||||
|
||||
-S size On Unix-like systems, set the size of the runtime stack to
|
||||
-S size On Unix-like systems, set the size of the runtime stack to
|
||||
size megabytes.
|
||||
|
||||
-t Run each compile, study, and match many times with a timer,
|
||||
and output resulting time per compile or match (in millisec-
|
||||
onds). Do not set -m with -t, because you will then get the
|
||||
size output a zillion times, and the timing will be dis-
|
||||
torted.
|
||||
-t Run each compile, study, and match many times with a timer,
|
||||
and output resulting time per compile or match (in millisec-
|
||||
onds). Do not set -m with -t, because you will then get the
|
||||
size output a zillion times, and the timing will be dis-
|
||||
torted. You can control the number of iterations that are
|
||||
used for timing by following -t with a number (as a separate
|
||||
item on the command line). For example, "-t 1000" would iter-
|
||||
ate 1000 times. The default is to iterate 500000 times.
|
||||
|
||||
-tm This is like -t except that it times only the matching phase,
|
||||
not the compile or study phases.
|
||||
|
||||
|
||||
DESCRIPTION
|
||||
|
||||
If pcretest is given two filename arguments, it reads from the first
|
||||
If pcretest is given two filename arguments, it reads from the first
|
||||
and writes to the second. If it is given only one filename argument, it
|
||||
reads from that file and writes to stdout. Otherwise, it reads from
|
||||
stdin and writes to stdout, and prompts for each line of input, using
|
||||
reads from that file and writes to stdout. Otherwise, it reads from
|
||||
stdin and writes to stdout, and prompts for each line of input, using
|
||||
"re>" to prompt for regular expressions, and "data>" to prompt for data
|
||||
lines.
|
||||
|
||||
When pcretest is built, a configuration option can specify that it
|
||||
should be linked with the libreadline library. When this is done, if
|
||||
the input is from a terminal, it is read using the readline() function.
|
||||
This provides line-editing and history facilities. The output from the
|
||||
-help option states whether or not readline() will be used.
|
||||
|
||||
The program handles any number of sets of input on a single input file.
|
||||
Each set starts with a regular expression, and continues with any num-
|
||||
Each set starts with a regular expression, and continues with any num-
|
||||
ber of data lines to be matched against the pattern.
|
||||
|
||||
Each data line is matched separately and independently. If you want to
|
||||
Each data line is matched separately and independently. If you want to
|
||||
do multi-line matches, you have to use the \n escape sequence (or \r or
|
||||
\r\n, depending on the newline setting) in a single line of input to
|
||||
encode the newline characters. There is no limit on the length of data
|
||||
lines; the input buffer is automatically extended if it is too small.
|
||||
\r\n, etc., depending on the newline setting) in a single line of input
|
||||
to encode the newline sequences. There is no limit on the length of
|
||||
data lines; the input buffer is automatically extended if it is too
|
||||
small.
|
||||
|
||||
An empty line signals the end of the data lines, at which point a new
|
||||
regular expression is read. The regular expressions are given enclosed
|
||||
@@ -131,39 +157,49 @@ PATTERN MODIFIERS
|
||||
The following table shows additional modifiers for setting PCRE options
|
||||
that do not correspond to anything in Perl:
|
||||
|
||||
/A PCRE_ANCHORED
|
||||
/C PCRE_AUTO_CALLOUT
|
||||
/E PCRE_DOLLAR_ENDONLY
|
||||
/f PCRE_FIRSTLINE
|
||||
/J PCRE_DUPNAMES
|
||||
/N PCRE_NO_AUTO_CAPTURE
|
||||
/U PCRE_UNGREEDY
|
||||
/X PCRE_EXTRA
|
||||
/<cr> PCRE_NEWLINE_CR
|
||||
/<lf> PCRE_NEWLINE_LF
|
||||
/<crlf> PCRE_NEWLINE_CRLF
|
||||
/A PCRE_ANCHORED
|
||||
/C PCRE_AUTO_CALLOUT
|
||||
/E PCRE_DOLLAR_ENDONLY
|
||||
/f PCRE_FIRSTLINE
|
||||
/J PCRE_DUPNAMES
|
||||
/N PCRE_NO_AUTO_CAPTURE
|
||||
/U PCRE_UNGREEDY
|
||||
/X PCRE_EXTRA
|
||||
/<JS> PCRE_JAVASCRIPT_COMPAT
|
||||
/<cr> PCRE_NEWLINE_CR
|
||||
/<lf> PCRE_NEWLINE_LF
|
||||
/<crlf> PCRE_NEWLINE_CRLF
|
||||
/<anycrlf> PCRE_NEWLINE_ANYCRLF
|
||||
/<any> PCRE_NEWLINE_ANY
|
||||
/<bsr_anycrlf> PCRE_BSR_ANYCRLF
|
||||
/<bsr_unicode> PCRE_BSR_UNICODE
|
||||
|
||||
Those specifying line endings are literal strings as shown. Details of
|
||||
the meanings of these PCRE options are given in the pcreapi documenta-
|
||||
tion.
|
||||
Those specifying line ending sequences are literal strings as shown,
|
||||
but the letters can be in either case. This example sets multiline
|
||||
matching with CRLF as the line ending sequence:
|
||||
|
||||
/^abc/m<crlf>
|
||||
|
||||
Details of the meanings of these PCRE options are given in the pcreapi
|
||||
documentation.
|
||||
|
||||
Finding all matches in a string
|
||||
|
||||
Searching for all possible matches within each subject string can be
|
||||
requested by the /g or /G modifier. After finding a match, PCRE is
|
||||
Searching for all possible matches within each subject string can be
|
||||
requested by the /g or /G modifier. After finding a match, PCRE is
|
||||
called again to search the remainder of the subject string. The differ-
|
||||
ence between /g and /G is that the former uses the startoffset argument
|
||||
to pcre_exec() to start searching at a new point within the entire
|
||||
string (which is in effect what Perl does), whereas the latter passes
|
||||
over a shortened substring. This makes a difference to the matching
|
||||
to pcre_exec() to start searching at a new point within the entire
|
||||
string (which is in effect what Perl does), whereas the latter passes
|
||||
over a shortened substring. This makes a difference to the matching
|
||||
process if the pattern begins with a lookbehind assertion (including \b
|
||||
or \B).
|
||||
|
||||
If any call to pcre_exec() in a /g or /G sequence matches an empty
|
||||
string, the next call is done with the PCRE_NOTEMPTY and PCRE_ANCHORED
|
||||
flags set in order to search for another, non-empty, match at the same
|
||||
point. If this second match fails, the start offset is advanced by
|
||||
one, and the normal match is retried. This imitates the way Perl han-
|
||||
If any call to pcre_exec() in a /g or /G sequence matches an empty
|
||||
string, the next call is done with the PCRE_NOTEMPTY and PCRE_ANCHORED
|
||||
flags set in order to search for another, non-empty, match at the same
|
||||
point. If this second match fails, the start offset is advanced by
|
||||
one, and the normal match is retried. This imitates the way Perl han-
|
||||
dles such cases when using the /g modifier or the split() function.
|
||||
|
||||
Other modifiers
|
||||
@@ -175,38 +211,43 @@ PATTERN MODIFIERS
|
||||
remainder of the subject string. This is useful for tests where the
|
||||
subject contains multiple copies of the same substring.
|
||||
|
||||
The /L modifier must be followed directly by the name of a locale, for
|
||||
The /B modifier is a debugging feature. It requests that pcretest out-
|
||||
put a representation of the compiled byte code after compilation. Nor-
|
||||
mally this information contains length and offset values; however, if
|
||||
/Z is also present, this data is replaced by spaces. This is a special
|
||||
feature for use in the automatic test scripts; it ensures that the same
|
||||
output is generated for different internal link sizes.
|
||||
|
||||
The /L modifier must be followed directly by the name of a locale, for
|
||||
example,
|
||||
|
||||
/pattern/Lfr_FR
|
||||
|
||||
For this reason, it must be the last modifier. The given locale is set,
|
||||
pcre_maketables() is called to build a set of character tables for the
|
||||
locale, and this is then passed to pcre_compile() when compiling the
|
||||
regular expression. Without an /L modifier, NULL is passed as the
|
||||
tables pointer; that is, /L applies only to the expression on which it
|
||||
pcre_maketables() is called to build a set of character tables for the
|
||||
locale, and this is then passed to pcre_compile() when compiling the
|
||||
regular expression. Without an /L modifier, NULL is passed as the
|
||||
tables pointer; that is, /L applies only to the expression on which it
|
||||
appears.
|
||||
|
||||
The /I modifier requests that pcretest output information about the
|
||||
compiled pattern (whether it is anchored, has a fixed first character,
|
||||
and so on). It does this by calling pcre_fullinfo() after compiling a
|
||||
pattern. If the pattern is studied, the results of that are also out-
|
||||
The /I modifier requests that pcretest output information about the
|
||||
compiled pattern (whether it is anchored, has a fixed first character,
|
||||
and so on). It does this by calling pcre_fullinfo() after compiling a
|
||||
pattern. If the pattern is studied, the results of that are also out-
|
||||
put.
|
||||
|
||||
The /D modifier is a PCRE debugging feature, which also assumes /I. It
|
||||
causes the internal form of compiled regular expressions to be output
|
||||
after compilation. If the pattern was studied, the information returned
|
||||
is also output.
|
||||
The /D modifier is a PCRE debugging feature, and is equivalent to /BI,
|
||||
that is, both the /B and the /I modifiers.
|
||||
|
||||
The /F modifier causes pcretest to flip the byte order of the fields in
|
||||
the compiled pattern that contain 2-byte and 4-byte numbers. This
|
||||
facility is for testing the feature in PCRE that allows it to execute
|
||||
the compiled pattern that contain 2-byte and 4-byte numbers. This
|
||||
facility is for testing the feature in PCRE that allows it to execute
|
||||
patterns that were compiled on a host with a different endianness. This
|
||||
feature is not available when the POSIX interface to PCRE is being
|
||||
used, that is, when the /P pattern modifier is specified. See also the
|
||||
feature is not available when the POSIX interface to PCRE is being
|
||||
used, that is, when the /P pattern modifier is specified. See also the
|
||||
section about saving and reloading compiled patterns below.
|
||||
|
||||
The /S modifier causes pcre_study() to be called after the expression
|
||||
The /S modifier causes pcre_study() to be called after the expression
|
||||
has been compiled, and the results used when the expression is matched.
|
||||
|
||||
The /M modifier causes the size of memory block used to hold the com-
|
||||
@@ -216,38 +257,38 @@ PATTERN MODIFIERS
|
||||
rather than its native API. When this is done, all other modifiers
|
||||
except /i, /m, and /+ are ignored. REG_ICASE is set if /i is present,
|
||||
and REG_NEWLINE is set if /m is present. The wrapper functions force
|
||||
PCRE_DOLLAR_ENDONLY always, and PCRE_DOTALL unless REG_NEWLINE is set.
|
||||
PCRE_DOLLAR_ENDONLY always, and PCRE_DOTALL unless REG_NEWLINE is set.
|
||||
|
||||
The /8 modifier causes pcretest to call PCRE with the PCRE_UTF8 option
|
||||
set. This turns on support for UTF-8 character handling in PCRE, pro-
|
||||
vided that it was compiled with this support enabled. This modifier
|
||||
The /8 modifier causes pcretest to call PCRE with the PCRE_UTF8 option
|
||||
set. This turns on support for UTF-8 character handling in PCRE, pro-
|
||||
vided that it was compiled with this support enabled. This modifier
|
||||
also causes any non-printing characters in output strings to be printed
|
||||
using the \x{hh...} notation if they are valid UTF-8 sequences.
|
||||
|
||||
If the /? modifier is used with /8, it causes pcretest to call
|
||||
pcre_compile() with the PCRE_NO_UTF8_CHECK option, to suppress the
|
||||
If the /? modifier is used with /8, it causes pcretest to call
|
||||
pcre_compile() with the PCRE_NO_UTF8_CHECK option, to suppress the
|
||||
checking of the string for UTF-8 validity.
|
||||
|
||||
|
||||
DATA LINES
|
||||
|
||||
Before each data line is passed to pcre_exec(), leading and trailing
|
||||
whitespace is removed, and it is then scanned for \ escapes. Some of
|
||||
these are pretty esoteric features, intended for checking out some of
|
||||
the more complicated features of PCRE. If you are just testing "ordi-
|
||||
nary" regular expressions, you probably don't need any of these. The
|
||||
Before each data line is passed to pcre_exec(), leading and trailing
|
||||
whitespace is removed, and it is then scanned for \ escapes. Some of
|
||||
these are pretty esoteric features, intended for checking out some of
|
||||
the more complicated features of PCRE. If you are just testing "ordi-
|
||||
nary" regular expressions, you probably don't need any of these. The
|
||||
following escapes are recognized:
|
||||
|
||||
\a alarm (= BEL)
|
||||
\b backspace
|
||||
\e escape
|
||||
\f formfeed
|
||||
\n newline
|
||||
\a alarm (BEL, \x07)
|
||||
\b backspace (\x08)
|
||||
\e escape (\x1b)
|
||||
\f formfeed (\x0c)
|
||||
\n newline (\x0a)
|
||||
\qdd set the PCRE_MATCH_LIMIT limit to dd
|
||||
(any number of digits)
|
||||
\r carriage return
|
||||
\t tab
|
||||
\v vertical tab
|
||||
\r carriage return (\x0d)
|
||||
\t tab (\x09)
|
||||
\v vertical tab (\x0b)
|
||||
\nnn octal character (up to 3 octal digits)
|
||||
\xhh hexadecimal character (up to 2 hex digits)
|
||||
\x{hh...} hexadecimal character, any number of digits
|
||||
@@ -304,12 +345,19 @@ DATA LINES
|
||||
or pcre_dfa_exec()
|
||||
\<crlf> pass the PCRE_NEWLINE_CRLF option to pcre_exec()
|
||||
or pcre_dfa_exec()
|
||||
\<anycrlf> pass the PCRE_NEWLINE_ANYCRLF option to pcre_exec()
|
||||
or pcre_dfa_exec()
|
||||
\<any> pass the PCRE_NEWLINE_ANY option to pcre_exec()
|
||||
or pcre_dfa_exec()
|
||||
|
||||
The escapes that specify line endings are literal strings, exactly as
|
||||
shown. A backslash followed by anything else just escapes the anything
|
||||
else. If the very last character is a backslash, it is ignored. This
|
||||
gives a way of passing an empty line as data, since a real empty line
|
||||
terminates the data input.
|
||||
The escapes that specify line ending sequences are literal strings,
|
||||
exactly as shown. No more than one newline setting should be present in
|
||||
any data line.
|
||||
|
||||
A backslash followed by anything else just escapes the anything else.
|
||||
If the very last character is a backslash, it is ignored. This gives a
|
||||
way of passing an empty line as data, since a real empty line termi-
|
||||
nates the data input.
|
||||
|
||||
If \M is present, pcretest calls pcre_exec() several times, with dif-
|
||||
ferent values in the match_limit and match_limit_recursion fields of
|
||||
@@ -335,38 +383,42 @@ DATA LINES
|
||||
The use of \x{hh...} to represent UTF-8 characters is not dependent on
|
||||
the use of the /8 modifier on the pattern. It is recognized always.
|
||||
There may be any number of hexadecimal digits inside the braces. The
|
||||
result is from one to six bytes, encoded according to the UTF-8 rules.
|
||||
result is from one to six bytes, encoded according to the original
|
||||
UTF-8 rules of RFC 2279. This allows for values in the range 0 to
|
||||
0x7FFFFFFF. Note that not all of those are valid Unicode code points,
|
||||
or indeed valid UTF-8 characters according to the later rules in RFC
|
||||
3629.
|
||||
|
||||
|
||||
THE ALTERNATIVE MATCHING FUNCTION
|
||||
|
||||
By default, pcretest uses the standard PCRE matching function,
|
||||
By default, pcretest uses the standard PCRE matching function,
|
||||
pcre_exec() to match each data line. From release 6.0, PCRE supports an
|
||||
alternative matching function, pcre_dfa_exec(), which operates in a
|
||||
different way, and has some restrictions. The differences between the
|
||||
alternative matching function, pcre_dfa_exec(), which operates in a
|
||||
different way, and has some restrictions. The differences between the
|
||||
two functions are described in the pcrematching documentation.
|
||||
|
||||
If a data line contains the \D escape sequence, or if the command line
|
||||
contains the -dfa option, the alternative matching function is called.
|
||||
If a data line contains the \D escape sequence, or if the command line
|
||||
contains the -dfa option, the alternative matching function is called.
|
||||
This function finds all possible matches at a given point. If, however,
|
||||
the \F escape sequence is present in the data line, it stops after the
|
||||
the \F escape sequence is present in the data line, it stops after the
|
||||
first match is found. This is always the shortest possible match.
|
||||
|
||||
|
||||
DEFAULT OUTPUT FROM PCRETEST
|
||||
|
||||
This section describes the output when the normal matching function,
|
||||
This section describes the output when the normal matching function,
|
||||
pcre_exec(), is being used.
|
||||
|
||||
When a match succeeds, pcretest outputs the list of captured substrings
|
||||
that pcre_exec() returns, starting with number 0 for the string that
|
||||
that pcre_exec() returns, starting with number 0 for the string that
|
||||
matched the whole pattern. Otherwise, it outputs "No match" or "Partial
|
||||
match" when pcre_exec() returns PCRE_ERROR_NOMATCH or PCRE_ERROR_PAR-
|
||||
TIAL, respectively, and otherwise the PCRE negative error number. Here
|
||||
match" when pcre_exec() returns PCRE_ERROR_NOMATCH or PCRE_ERROR_PAR-
|
||||
TIAL, respectively, and otherwise the PCRE negative error number. Here
|
||||
is an example of an interactive pcretest run.
|
||||
|
||||
$ pcretest
|
||||
PCRE version 5.00 07-Sep-2004
|
||||
PCRE version 7.0 30-Nov-2006
|
||||
|
||||
re> /^abc(\d+)/
|
||||
data> abc123
|
||||
@@ -375,18 +427,35 @@ DEFAULT OUTPUT FROM PCRETEST
|
||||
data> xyz
|
||||
No match
|
||||
|
||||
Note that unset capturing substrings that are not followed by one that
|
||||
is set are not returned by pcre_exec(), and are not shown by pcretest.
|
||||
In the following example, there are two capturing substrings, but when
|
||||
the first data line is matched, the second, unset substring is not
|
||||
shown. An "internal" unset substring is shown as "<unset>", as for the
|
||||
second data line.
|
||||
|
||||
re> /(a)|(b)/
|
||||
data> a
|
||||
0: a
|
||||
1: a
|
||||
data> b
|
||||
0: b
|
||||
1: <unset>
|
||||
2: b
|
||||
|
||||
If the strings contain any non-printing characters, they are output as
|
||||
\xhh escapes, or as \x{...} escapes if the /8 modifier was present on
|
||||
the pattern. If the pattern has the /+ modifier, the output for sub-
|
||||
string 0 is followed by the rest of the subject string, identified
|
||||
by "0+" like this:
|
||||
the pattern. See below for the definition of non-printing characters.
|
||||
If the pattern has the /+ modifier, the output for substring 0 is fol-
|
||||
lowed by the rest of the subject string, identified by "0+" like
|
||||
this:
|
||||
|
||||
re> /cat/+
|
||||
data> cataract
|
||||
0: cat
|
||||
0+ aract
|
||||
|
||||
If the pattern has the /g or /G modifier, the results of successive
|
||||
If the pattern has the /g or /G modifier, the results of successive
|
||||
matching attempts are output in sequence, like this:
|
||||
|
||||
re> /\Bi(\w\w)/g
|
||||
@@ -400,24 +469,24 @@ DEFAULT OUTPUT FROM PCRETEST
|
||||
|
||||
"No match" is output only if the first match attempt fails.
|
||||
|
||||
If any of the sequences \C, \G, or \L are present in a data line that
|
||||
is successfully matched, the substrings extracted by the convenience
|
||||
If any of the sequences \C, \G, or \L are present in a data line that
|
||||
is successfully matched, the substrings extracted by the convenience
|
||||
functions are output with C, G, or L after the string number instead of
|
||||
a colon. This is in addition to the normal full list. The string length
|
||||
(that is, the return from the extraction function) is given in paren-
|
||||
(that is, the return from the extraction function) is given in paren-
|
||||
theses after each string for \C and \G.
|
||||
|
||||
Note that while patterns can be continued over several lines (a plain
|
||||
Note that whereas patterns can be continued over several lines (a plain
|
||||
">" prompt is used for continuations), data lines may not. However new-
|
||||
lines can be included in data by means of the \n escape (or \r or \r\n
|
||||
for those newline settings).
|
||||
lines can be included in data by means of the \n escape (or \r, \r\n,
|
||||
etc., depending on the newline sequence setting).
|
||||
|
||||
|
||||
OUTPUT FROM THE ALTERNATIVE MATCHING FUNCTION
|
||||
|
||||
When the alternative matching function, pcre_dfa_exec(), is used (by
|
||||
means of the \D escape sequence or the -dfa command line option), the
|
||||
output consists of a list of all the matches that start at the first
|
||||
When the alternative matching function, pcre_dfa_exec(), is used (by
|
||||
means of the \D escape sequence or the -dfa command line option), the
|
||||
output consists of a list of all the matches that start at the first
|
||||
point in the subject where there is at least one match. For example:
|
||||
|
||||
re> /(tang|tangerine|tan)/
|
||||
@@ -426,11 +495,11 @@ OUTPUT FROM THE ALTERNATIVE MATCHING FUNCTION
|
||||
1: tang
|
||||
2: tan
|
||||
|
||||
(Using the normal matching function on this data finds only "tang".)
|
||||
The longest matching string is always given first (and numbered zero).
|
||||
(Using the normal matching function on this data finds only "tang".)
|
||||
The longest matching string is always given first (and numbered zero).
|
||||
|
||||
If /gP is present on the pattern, the search for further matches
|
||||
resumes at the end of the longest match. For example:
|
||||
If /g is present on the pattern, the search for further matches resumes
|
||||
at the end of the longest match. For example:
|
||||
|
||||
re> /(tang|tangerine|tan)/g
|
||||
data> yellow tangerine and tangy sultana\D
|
||||
@@ -453,7 +522,7 @@ RESTARTING AFTER A PARTIAL MATCH
|
||||
can restart the match with additional subject data by means of the \R
|
||||
escape sequence. For example:
|
||||
|
||||
re> /^?(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)$/
|
||||
re> /^\d?\d(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\d\d$/
|
||||
data> 23ja\P\D
|
||||
Partial match: 23ja
|
||||
data> n05\R\D
|
||||
@@ -503,67 +572,88 @@ CALLOUTS
|
||||
the pcrecallout documentation.
|
||||
|
||||
|
||||
NON-PRINTING CHARACTERS
|
||||
|
||||
When pcretest is outputting text in the compiled version of a pattern,
|
||||
bytes other than 32-126 are always treated as non-printing characters
|
||||
and are therefore shown as hex escapes.
|
||||
|
||||
When pcretest is outputting text that is a matched part of a subject
|
||||
string, it behaves in the same way, unless a different locale has been
|
||||
set for the pattern (using the /L modifier). In this case, the
|
||||
isprint() function is used to distinguish printing and non-printing characters.
|
||||
|
||||
|
||||
SAVING AND RELOADING COMPILED PATTERNS
|
||||
|
||||
The facilities described in this section are not available when the
|
||||
The facilities described in this section are not available when the
|
||||
POSIX interface to PCRE is being used, that is, when the /P pattern mod-
|
||||
ifier is specified.
|
||||
|
||||
When the POSIX interface is not in use, you can cause pcretest to write
|
||||
a compiled pattern to a file, by following the modifiers with > and a
|
||||
a compiled pattern to a file, by following the modifiers with > and a
|
||||
file name. For example:
|
||||
|
||||
/pattern/im >/some/file
|
||||
|
||||
See the pcreprecompile documentation for a discussion about saving and
|
||||
See the pcreprecompile documentation for a discussion about saving and
|
||||
re-using compiled patterns.
|
||||
|
||||
The data that is written is binary. The first eight bytes are the
|
||||
length of the compiled pattern data followed by the length of the
|
||||
optional study data, each written as four bytes in big-endian order
|
||||
(most significant byte first). If there is no study data (either the
|
||||
The data that is written is binary. The first eight bytes are the
|
||||
length of the compiled pattern data followed by the length of the
|
||||
optional study data, each written as four bytes in big-endian order
|
||||
(most significant byte first). If there is no study data (either the
|
||||
pattern was not studied, or studying did not return any data), the sec-
|
||||
ond length is zero. The lengths are followed by an exact copy of the
|
||||
ond length is zero. The lengths are followed by an exact copy of the
|
||||
compiled pattern. If there is additional study data, this follows imme-
|
||||
diately after the compiled pattern. After writing the file, pcretest
|
||||
diately after the compiled pattern. After writing the file, pcretest
|
||||
expects to read a new pattern.
|
||||
|
||||
A saved pattern can be reloaded into pcretest by specifying < and a file
|
||||
name instead of a pattern. The name of the file must not contain a <
|
||||
character, as otherwise pcretest will interpret the line as a pattern
|
||||
name instead of a pattern. The name of the file must not contain a <
|
||||
character, as otherwise pcretest will interpret the line as a pattern
|
||||
delimited by < characters. For example:
|
||||
|
||||
re> </some/file
|
||||
Compiled regex loaded from /some/file
|
||||
No study data
|
||||
|
||||
When the pattern has been loaded, pcretest proceeds to read data lines
|
||||
When the pattern has been loaded, pcretest proceeds to read data lines
|
||||
in the usual way.
|
||||
|
||||
You can copy a file written by pcretest to a different host and reload
|
||||
it there, even if the new host has opposite endianness to the one on
|
||||
which the pattern was compiled. For example, you can compile on an i86
|
||||
You can copy a file written by pcretest to a different host and reload
|
||||
it there, even if the new host has opposite endianness to the one on
|
||||
which the pattern was compiled. For example, you can compile on an i86
|
||||
machine and run on a SPARC machine.
|
||||
|
||||
File names for saving and reloading can be absolute or relative, but
|
||||
note that the shell facility of expanding a file name that starts with
|
||||
File names for saving and reloading can be absolute or relative, but
|
||||
note that the shell facility of expanding a file name that starts with
|
||||
a tilde (~) is not available.
|
||||
|
||||
The ability to save and reload files in pcretest is intended for test-
|
||||
ing and experimentation. It is not intended for production use because
|
||||
only a single pattern can be written to a file. Furthermore, there is
|
||||
no facility for supplying custom character tables for use with a
|
||||
reloaded pattern. If the original pattern was compiled with custom
|
||||
tables, an attempt to match a subject string using a reloaded pattern
|
||||
is likely to cause pcretest to crash. Finally, if you attempt to load
|
||||
The ability to save and reload files in pcretest is intended for test-
|
||||
ing and experimentation. It is not intended for production use because
|
||||
only a single pattern can be written to a file. Furthermore, there is
|
||||
no facility for supplying custom character tables for use with a
|
||||
reloaded pattern. If the original pattern was compiled with custom
|
||||
tables, an attempt to match a subject string using a reloaded pattern
|
||||
is likely to cause pcretest to crash. Finally, if you attempt to load
|
||||
a file that is not in the correct format, the result is undefined.
|
||||
|
||||
|
||||
SEE ALSO
|
||||
|
||||
pcre(3), pcreapi(3), pcrecallout(3), pcrematching(3), pcrepartial(3),
|
||||
pcrepattern(3), pcreprecompile(3).
|
||||
|
||||
|
||||
AUTHOR
|
||||
|
||||
Philip Hazel
|
||||
University Computing Service,
|
||||
Cambridge CB2 3QG, England.
|
||||
University Computing Service
|
||||
Cambridge CB2 3QH, England.
|
||||
|
||||
Last updated: 29 June 2006
|
||||
Copyright (c) 1997-2006 University of Cambridge.
|
||||
|
||||
REVISION
|
||||
|
||||
Last updated: 10 March 2009
|
||||
Copyright (c) 1997-2009 University of Cambridge.
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user