It is now possible to use the (?:...) syntax to group elements in
a regular expression without making their matched substring available
in the Match_Array.
The following test must output:
Matched (0)= 1.. 6
Matched (1)= 5.. 5
Matched (2)= 0.. 0
with Ada.Text_IO; use Ada.Text_IO;
with GNAT.Regpat; use GNAT.Regpat;

--  Regression test for non-capturing groups in GNAT.Regpat.
--
--  The pattern contains one non-capturing group "(?:a*)" followed by one
--  capturing group "(a+)". Only the capturing group is expected to occupy
--  a slot in the match array, so slot 1 describes "(a+)" and slot 2 is
--  expected to be left unset (printed as 0 .. 0).

procedure Main is

   Matcher : constant Pattern_Matcher := Compile ("ab(?:a*)(a+)b");
   Groups  : Match_Array (0 .. 3);

   procedure Show (Index : String; Loc : Match_Location);
   --  Print the First .. Last bounds stored in Loc, labelled with Index

   ----------
   -- Show --
   ----------

   procedure Show (Index : String; Loc : Match_Location) is
   begin
      Put_Line
        ("Matched (" & Index & ")=" & Loc.First'Img & ".." & Loc.Last'Img);
   end Show;

begin
   Match (Matcher, "abaaab", Groups);

   --  Nothing is printed when the whole expression fails to match

   if Groups (0) = No_Match then
      return;
   end if;

   Show ("0", Groups (0));
   Show ("1", Groups (1));
   Show ("2", Groups (2));
end Main;
Tested on x86_64-pc-linux-gnu, committed on trunk
2014-08-01 Emmanuel Briot <[email protected]>
* s-regpat.adb (Parse): Add support for non-capturing parentheses.
* s-regpat.ads: Document non-capturing parentheses.
Index: s-regpat.adb
===================================================================
--- s-regpat.adb (revision 213263)
+++ s-regpat.adb (working copy)
@@ -7,7 +7,7 @@
-- B o d y --
-- --
-- Copyright (C) 1986 by University of Toronto. --
--- Copyright (C) 1999-2013, AdaCore --
+-- Copyright (C) 1999-2014, AdaCore --
-- --
-- GNAT is free software; you can redistribute it and/or modify it under --
-- terms of the GNU General Public License as published by the Free Soft- --
@@ -410,10 +410,13 @@
procedure Parse
(Parenthesized : Boolean;
+ Capturing : Boolean;
Flags : out Expression_Flags;
IP : out Pointer);
-- Parse regular expression, i.e. main body or parenthesized thing
-- Caller must absorb opening parenthesis.
+ -- Capturing should be set to True when we have an open parenthesis
+ -- from which we want the user to extract text.
procedure Parse_Branch
(Flags : out Expression_Flags;
@@ -831,9 +834,10 @@
-- the branches to what follows makes it hard to avoid.
procedure Parse
- (Parenthesized : Boolean;
- Flags : out Expression_Flags;
- IP : out Pointer)
+ (Parenthesized : Boolean;
+ Capturing : Boolean;
+ Flags : out Expression_Flags;
+ IP : out Pointer)
is
E : String renames Expression;
Br, Br2 : Pointer;
@@ -847,7 +851,7 @@
-- Make an OPEN node, if parenthesized
- if Parenthesized then
+ if Parenthesized and then Capturing then
if Matcher.Paren_Count > Max_Paren_Count then
Fail ("too many ()");
end if;
@@ -856,7 +860,6 @@
Matcher.Paren_Count := Matcher.Paren_Count + 1;
IP := Emit_Node (OPEN);
Emit (Character'Val (Par_No));
-
else
IP := 0;
Par_No := 0;
@@ -913,14 +916,19 @@
-- Make a closing node, and hook it on the end
if Parenthesized then
- Ender := Emit_Node (CLOSE);
- Emit (Character'Val (Par_No));
+ if Capturing then
+ Ender := Emit_Node (CLOSE);
+ Emit (Character'Val (Par_No));
+ Link_Tail (IP, Ender);
+ else
+ -- need to keep looking after the closing parenthesis
+ null;
+ end if;
else
Ender := Emit_Node (EOP);
+ Link_Tail (IP, Ender);
end if;
- Link_Tail (IP, Ender);
-
if Have_Branch and then Emit_Ptr <= PM.Size + 1 then
-- Hook the tails of the branches to the closing node
@@ -945,7 +953,7 @@
elsif Parse_Pos <= Parse_End then
if E (Parse_Pos) = ')' then
- Fail ("unmatched ()");
+ Fail ("unmatched ')'");
else
Fail ("junk on end"); -- "Can't happen"
end if;
@@ -1003,16 +1011,24 @@
New_Flags : Expression_Flags;
begin
- Parse (True, New_Flags, IP);
-
- if IP = 0 then
- return;
+ if Parse_Pos <= Parse_End - 1
+ and then Expression (Parse_Pos) = '?'
+ and then Expression (Parse_Pos + 1) = ':'
+ then
+ Parse_Pos := Parse_Pos + 2;
+ -- non-capturing parenthesis
+ Parse (True, False, New_Flags, IP);
+ else
+ -- capturing parenthesis
+ Parse (True, True, New_Flags, IP);
+ Expr_Flags.Has_Width :=
+ Expr_Flags.Has_Width or else New_Flags.Has_Width;
+ Expr_Flags.SP_Start :=
+ Expr_Flags.SP_Start or else New_Flags.SP_Start;
+ if IP = 0 then
+ return;
+ end if;
end if;
-
- Expr_Flags.Has_Width :=
- Expr_Flags.Has_Width or else New_Flags.Has_Width;
- Expr_Flags.SP_Start :=
- Expr_Flags.SP_Start or else New_Flags.SP_Start;
end;
when '|' | ASCII.LF | ')' =>
@@ -1971,7 +1987,7 @@
-- Start of processing for Compile
begin
- Parse (False, Expr_Flags, Result);
+ Parse (False, False, Expr_Flags, Result);
if Result = 0 then
Fail ("Couldn't compile expression");
Index: s-regpat.ads
===================================================================
--- s-regpat.ads (revision 213263)
+++ s-regpat.ads (working copy)
@@ -7,7 +7,7 @@
-- S p e c --
-- --
-- Copyright (C) 1986 by University of Toronto. --
--- Copyright (C) 1996-2010, AdaCore --
+-- Copyright (C) 1996-2014, AdaCore --
-- --
-- GNAT is free software; you can redistribute it and/or modify it under --
-- terms of the GNU General Public License as published by the Free Soft- --
@@ -78,8 +78,10 @@
-- ::= [^ range range ...] -- matches any character not listed
-- ::= . -- matches any single character
-- -- except newlines
- -- ::= ( expr ) -- parens used for grouping
- -- ::= \ num -- reference to num-th parenthesis
+ -- ::= ( expr ) -- parenthesis used for grouping
+ -- ::= (?: expr ) -- non-capturing parenthesis
+ -- ::= \ num -- reference to num-th capturing
+ -- parenthesis
-- range ::= char - char -- matches chars in given range
-- ::= nchr
@@ -345,6 +347,9 @@
-- N'th parenthesized subexpressions; Matches (0) is for the whole
-- expression.
--
+ -- Non-capturing parentheses (introduced with (?:...)) cannot be
+ -- retrieved and do not count in the match array index.
+ --
-- For instance, if your regular expression is: "a((b*)c+)(d+)", then
-- 12 3
-- Matches (0) is for "a((b*)c+)(d+)" (the entire expression)