Blame - dist2/perltest.sh - platform/external/pcre

blob: 9cf7b17f73ba08086eeeffdf56560c88f30f5bce [file] [log] [blame]

Janis Danisevskis	112c9cc	2016-03-31 13:35:25 +0100	[diff] [blame^]	1	#! /bin/sh
				2
				3	# Script for testing regular expressions with perl to check that PCRE2 handles
				4	# them the same. The Perl code has to have "use utf8" and "require Encode" at
				5	# the start when running UTF-8 tests, but not for non-utf8 tests. (The
				6	# "require" would actually be OK for non-utf8-tests, but is not always
				7	# installed, so this way the script will always run for these tests.)
				8	#
				9	# The desired effect is achieved by making this a shell script that passes the
				10	# Perl script to Perl through a pipe. If the first argument is "-utf8", a
				11	# suitable prefix is set up.
				12	#
				13	# The remaining arguments, if any, are passed to Perl. They are an input file
				14	# and an output file. If there is one argument, the output is written to
				15	# STDOUT. If Perl receives no arguments, it opens /dev/tty as input, and writes
				16	# output to STDOUT. (I haven't found a way of getting it to use STDIN, because
				17	# of the contorted piping input.)
				18
				19	perl=perl
				20	prefix=''
				21	if [ $# -gt 0 -a "$1" = "-utf8" ] ; then
				22	prefix="use utf8; require Encode;"
				23	shift
				24	fi
				25
				26
				27	# The Perl script that follows has a similar specification to pcre2test, and so
				28	# can be given identical input, except that input patterns can be followed only
				29	# by Perl's lower case modifiers and certain other pcre2test modifiers that are
				30	# either handled or ignored:
				31	#
				32	# aftertext interpreted as "print $' afterwards"
				33	# afteralltext ignored
				34	# dupnames ignored (Perl always allows)
				35	# mark ignored
				36	# no_auto_possess ignored
				37	# no_start_optimize ignored
				38	# ucp sets Perl's /u modifier
				39	# utf invoke UTF-8 functionality
				40	#
				41	# The data lines must not have any pcre2test modifiers. They are processed as
				42	# Perl double-quoted strings, so if they contain " $ or @ characters, these
				43	# have to be escaped. For this reason, all such characters in the
				44	# Perl-compatible testinput1 and testinput4 files are escaped so that they can
				45	# be used for perltest as well as for pcre2test. The output from this script
				46	# should be same as from pcre2test, apart from the initial identifying banner.
				47	#
				48	# The other testinput files are not suitable for feeding to perltest.sh,
				49	# because they make use of the special modifiers that pcre2test uses for
				50	# testing features of PCRE2. Some of these files also contain malformed regular
				51	# expressions, in order to check that PCRE2 diagnoses them correctly.
				52
				53	(echo "$prefix" ; cat <<'PERLEND'
				54
				55	# Function for turning a string into a string of printing chars.
				56
				57	sub pchars {
				58	my($t) = "";
				59	if ($utf8)
				60	{
				61	@p = unpack('U*', $_[0]);
				62	foreach $c (@p)
				63	{
				64	if ($c >= 32 && $c < 127) { $t .= chr $c; }
				65	else { $t .= sprintf("\\x{%02x}", $c);
				66	}
				67	}
				68	}
				69	else
				70	{
				71	foreach $c (split(//, $_[0]))
				72	{
				73	if (ord $c >= 32 && ord $c < 127) { $t .= $c; }
				74	else { $t .= sprintf("\\x%02x", ord $c); }
				75	}
				76	}
				77	$t;
				78	}
				79
				80
				81	# Read lines from a named file or stdin and write to a named file or stdout;
				82	# lines consist of a regular expression, in delimiters and optionally followed
				83	# by options, followed by a set of test data, terminated by an empty line.
				84
				85	# Sort out the input and output files
				86
				87	if (@ARGV > 0)
				88	{
				89	open(INFILE, "<$ARGV[0]") \|\| die "Failed to open $ARGV[0]\n";
				90	$infile = "INFILE";
				91	$interact = 0;
				92	}
				93	else
				94	{
				95	open(INFILE, "</dev/tty") \|\| die "Failed to open /dev/tty\n";
				96	$infile = "INFILE";
				97	$interact = 1;
				98	}
				99
				100	if (@ARGV > 1)
				101	{
				102	open(OUTFILE, ">$ARGV[1]") \|\| die "Failed to open $ARGV[1]\n";
				103	$outfile = "OUTFILE";
				104	}
				105	else { $outfile = "STDOUT"; }
				106
				107	printf($outfile "Perl $] Regular Expressions\n\n");
				108
				109	# Main loop
				110
				111	NEXT_RE:
				112	for (;;)
				113	{
				114	printf " re> " if $interact;
				115	last if ! ($_ = <$infile>);
				116	printf $outfile "$_" if ! $interact;
				117	next if ($_ =~ /^\s*$/ \|\| $_ =~ /^#/);
				118
				119	$pattern = $_;
				120
				121	while ($pattern !~ /^\s(.).\1/s)
				122	{
				123	printf " > " if $interact;
				124	last if ! ($_ = <$infile>);
				125	printf $outfile "$_" if ! $interact;
				126	$pattern .= $_;
				127	}
				128
				129	chomp($pattern);
				130	$pattern =~ s/\s+$//;
				131
				132	# Split the pattern from the modifiers and adjust them as necessary.
				133
				134	$pattern =~ /^\s((.).\2)(.*)$/s;
				135	$pat = $1;
				136	$mod = $3;
				137
				138	# The private "aftertext" modifier means "print $' afterwards".
				139
				140	$showrest = ($mod =~ s/aftertext,?//);
				141
				142	# "allaftertext" is used by pcre2test to print remainders after captures
				143
				144	$mod =~ s/allaftertext,?//;
				145
				146	# Detect utf
				147
				148	$utf8 = $mod =~ s/utf,?//;
				149
				150	# Remove "dupnames".
				151
				152	$mod =~ s/dupnames,?//;
				153
				154	# Remove "mark" (asks pcre2test to check MARK data) */
				155
				156	$mod =~ s/mark,?//;
				157
				158	# "ucp" asks pcre2test to set PCRE2_UCP; change this to /u for Perl
				159
				160	$mod =~ s/ucp,?/u/;
				161
				162	# Remove "no_auto_possess" and "no_start_optimize" (disable PCRE2 optimizations)
				163
				164	$mod =~ s/no_auto_possess,?//;
				165	$mod =~ s/no_start_optimize,?//;
				166
				167	# Add back retained modifiers and check that the pattern is valid.
				168
				169	$mod =~ s/,//g;
				170	$pattern = "$pat$mod";
				171	eval "\$_ =~ ${pattern}";
				172	if ($@)
				173	{
				174	printf $outfile "Error: $@";
				175	if (! $interact)
				176	{
				177	for (;;)
				178	{
				179	last if ! ($_ = <$infile>);
				180	last if $_ =~ /^\s*$/;
				181	}
				182	}
				183	next NEXT_RE;
				184	}
				185
				186	# If the /g modifier is present, we want to put a loop round the matching;
				187	# otherwise just a single "if".
				188
				189	$cmd = ($pattern =~ /g[a-z]*$/)? "while" : "if";
				190
				191	# If the pattern is actually the null string, Perl uses the most recently
				192	# executed (and successfully compiled) regex is used instead. This is a
				193	# nasty trap for the unwary! The PCRE2 test suite does contain null strings
				194	# in places - if they are allowed through here all sorts of weird and
				195	# unexpected effects happen. To avoid this, we replace such patterns with
				196	# a non-null pattern that has the same effect.
				197
				198	$pattern = "/(?#)/$2" if ($pattern =~ /^(.)\1(.*)$/);
				199
				200	# Read data lines and test them
				201
				202	for (;;)
				203	{
				204	printf "data> " if $interact;
				205	last NEXT_RE if ! ($_ = <$infile>);
				206	chomp;
				207	printf $outfile "%s", "$_\n" if ! $interact;
				208
				209	s/\s+$//; # Remove trailing space
				210	s/^\s+//; # Remove leading space
				211
				212	last if ($_ eq "");
				213	next if $_ =~ /^\\=(?:\s\|$)/; # Comment line
				214
				215	$x = eval "\"$_\""; # To get escapes processed
				216
				217	# Empty array for holding results, ensure $REGERROR and $REGMARK are
				218	# unset, then do the matching.
				219
				220	@subs = ();
				221
				222	$pushes = "push \@subs,\$&;" .
				223	"push \@subs,\$1;" .
				224	"push \@subs,\$2;" .
				225	"push \@subs,\$3;" .
				226	"push \@subs,\$4;" .
				227	"push \@subs,\$5;" .
				228	"push \@subs,\$6;" .
				229	"push \@subs,\$7;" .
				230	"push \@subs,\$8;" .
				231	"push \@subs,\$9;" .
				232	"push \@subs,\$10;" .
				233	"push \@subs,\$11;" .
				234	"push \@subs,\$12;" .
				235	"push \@subs,\$13;" .
				236	"push \@subs,\$14;" .
				237	"push \@subs,\$15;" .
				238	"push \@subs,\$16;" .
				239	"push \@subs,\$'; }";
				240
				241	undef $REGERROR;
				242	undef $REGMARK;
				243
				244	eval "${cmd} (\$x =~ ${pattern}) {" . $pushes;
				245
				246	if ($@)
				247	{
				248	printf $outfile "Error: $@\n";
				249	next NEXT_RE;
				250	}
				251	elsif (scalar(@subs) == 0)
				252	{
				253	printf $outfile "No match";
				254	if (defined $REGERROR && $REGERROR != 1)
				255	{ printf $outfile (", mark = %s", &pchars($REGERROR)); }
				256	printf $outfile "\n";
				257	}
				258	else
				259	{
				260	while (scalar(@subs) != 0)
				261	{
				262	printf $outfile (" 0: %s\n", &pchars($subs[0]));
				263	printf $outfile (" 0+ %s\n", &pchars($subs[17])) if $showrest;
				264	$last_printed = 0;
				265	for ($i = 1; $i <= 16; $i++)
				266	{
				267	if (defined $subs[$i])
				268	{
				269	while ($last_printed++ < $i-1)
				270	{ printf $outfile ("%2d: <unset>\n", $last_printed); }
				271	printf $outfile ("%2d: %s\n", $i, &pchars($subs[$i]));
				272	$last_printed = $i;
				273	}
				274	}
				275	splice(@subs, 0, 18);
				276	}
				277
				278	# It seems that $REGMARK is not marked as UTF-8 even when use utf8 is
				279	# set and the input pattern was a UTF-8 string. We can, however, force
				280	# it to be so marked.
				281
				282	if (defined $REGMARK && $REGMARK != 1)
				283	{
				284	$xx = $REGMARK;
				285	$xx = Encode::decode_utf8($xx) if $utf8;
				286	printf $outfile ("MK: %s\n", &pchars($xx));
				287	}
				288	}
				289	}
				290	}
				291
				292	# printf $outfile "\n";
				293
				294	PERLEND
				295	) \| $perl - $@
				296
				297	# End