shebang handling

2021-07-04 Thread Mark Wielaard
Hi,

Shebang handling, the first line starting with #! was not done fully
correct and it isn't necessary to keep track of the shebang line in
the AST or HIR Crate classes.

Because an inner attribute also starts with #! the first line isn't
regarded as a shebang line if the #! is followed by (optional)
whitespace and comments and a [. In that case the #! is seen as the
start of an inner attribute.

I added various testcases that hopefully show the funny things you can
get when the first line starts with #!.

 [PATCH 1/2] Handle shebang line, plus any whitespace and comment
 [PATCH 2/2] Remove has_shebang flag from AST and HIR Crate classes

Patches can also be found here:
https://code.wildebeest.org/git/user/mjw/gccrs/log/?h=shebang

Cheers,

Mark
-- 
Gcc-rust mailing list
Gcc-rust@gcc.gnu.org
https://gcc.gnu.org/mailman/listinfo/gcc-rust


[PATCH 1/2] Handle shebang line, plus any whitespace and comment skipping in lexer

2021-07-04 Thread Mark Wielaard
The lexer tried to handle the shebang line but used loc directly,
instead of the current_column. And it assumed a '/' should immediately
follow the "#!". But if the "#!" is followed by whitespace and/or
comments and a '[' character, then the first line isn't see as a
shebang line (even if the kernel or shell would) but as the start of
an inner attribute.

Add various tests for when the first line starting with "#!" is seen
as a shebang line (and should be skipped). And some tests there is a
'[' character following some whitespace and/or comments and the "#!"
is seen as part of an inner attribute.
---
 gcc/rust/lex/rust-lex.cc  | 79 ++-
 .../rust/compile/torture/not_shebang.rs   |  3 +
 .../torture/not_shebang_block_comment.rs  |  1 +
 .../compile/torture/not_shebang_comment.rs|  3 +
 .../torture/not_shebang_multiline_comment.rs  |  7 ++
 .../compile/torture/not_shebang_spaces.rs |  6 ++
 gcc/testsuite/rust/compile/torture/shebang.rs |  3 +
 .../rust/compile/torture/shebang_plus_attr.rs |  3 +
 .../compile/torture/shebang_plus_attr2.rs |  3 +
 9 files changed, 89 insertions(+), 19 deletions(-)
 create mode 100644 gcc/testsuite/rust/compile/torture/not_shebang.rs
 create mode 100644 
gcc/testsuite/rust/compile/torture/not_shebang_block_comment.rs
 create mode 100644 gcc/testsuite/rust/compile/torture/not_shebang_comment.rs
 create mode 100644 
gcc/testsuite/rust/compile/torture/not_shebang_multiline_comment.rs
 create mode 100644 gcc/testsuite/rust/compile/torture/not_shebang_spaces.rs
 create mode 100755 gcc/testsuite/rust/compile/torture/shebang.rs
 create mode 100755 gcc/testsuite/rust/compile/torture/shebang_plus_attr.rs
 create mode 100755 gcc/testsuite/rust/compile/torture/shebang_plus_attr2.rs

diff --git a/gcc/rust/lex/rust-lex.cc b/gcc/rust/lex/rust-lex.cc
index d1384168731..ebd69de0fd1 100644
--- a/gcc/rust/lex/rust-lex.cc
+++ b/gcc/rust/lex/rust-lex.cc
@@ -237,28 +237,63 @@ Lexer::build_token ()
   current_char = peek_input ();
   skip_input ();
 
-  // return end of file token if end of file
-  if (current_char == EOF)
-   return Token::make (END_OF_FILE, loc);
-
   // detect shebang
-  if (loc == 1 && current_line == 1 && current_char == '#')
+  // Must be the first thing on the first line, starting with #!
+  // But since an attribute can also start with an #! we don't count it as 
a
+  // shebang line when after any whitespace or comments there is a [. If it
+  // is a shebang line we simple drop the line. Otherwise we don't consume
+  // any characters and fall through to the real tokenizer.
+  if (current_line == 1 && current_column == 1 && current_char == '#'
+ && peek_input () == '!')
{
- current_char = peek_input ();
-
- if (current_char == '!')
+ int n = 1;
+ while (true)
{
- skip_input ();
- current_char = peek_input ();
-
- if (current_char == '/')
+ int next_char = peek_input (n);
+ if (is_whitespace (next_char))
+   n++;
+ else if (next_char == '/' && peek_input (n + 1) == '/')
{
- // definitely shebang
-
- skip_input ();
-
- // ignore rest of line
- while (current_char != '\n')
+ // A single line comment
+ n += 2;
+ next_char = peek_input (n);
+ while (next_char != '\n' && next_char != EOF)
+   {
+ n++;
+ next_char = peek_input (n);
+   }
+ if (next_char == '\n')
+   n++;
+   }
+ else if (next_char == '/' && peek_input (n + 1) == '*')
+   {
+ // Start of a block comment
+ n += 2;
+ int level = 1;
+ while (level > 0)
+   {
+ if (peek_input (n) == EOF)
+   break;
+ else if (peek_input (n) == '/'
+  && peek_input (n + 1) == '*')
+   {
+ n += 2;
+ level += 1;
+   }
+ else if (peek_input (n) == '*'
+  && peek_input (n + 1) == '/')
+   {
+ n += 2;
+ level -= 1;
+   }
+ else
+   n++;
+   }
+   }
+ else if (next_char != '[')
+   {
+ // definitely shebang, ignore the first line
+ while (current_char != '\n' && current_char != EOF)
{
  current_char = peek_input ();
  skip_input ();
@@ -269,11 +304,17 @

[PATCH 2/2] Remove has_shebang flag from AST and HIR Crate classes

2021-07-04 Thread Mark Wielaard
The lexer deals with the shebang and the parser cannot detect whether
there is or isn't a shebang line. The flag isn't relevant or useful in
the AST and HIR Crate classes.
---
 gcc/rust/ast/rust-ast-full-test.cc  |  5 +
 gcc/rust/ast/rust-ast.h | 13 +
 gcc/rust/hir/rust-ast-lower.cc  |  3 +--
 gcc/rust/hir/tree/rust-hir-full-test.cc |  6 +-
 gcc/rust/hir/tree/rust-hir.h| 14 +-
 gcc/rust/parse/rust-parse-impl.h| 12 +---
 6 files changed, 18 insertions(+), 35 deletions(-)

diff --git a/gcc/rust/ast/rust-ast-full-test.cc 
b/gcc/rust/ast/rust-ast-full-test.cc
index 3d339ad1aed..12ef255bcbf 100644
--- a/gcc/rust/ast/rust-ast-full-test.cc
+++ b/gcc/rust/ast/rust-ast-full-test.cc
@@ -172,13 +172,10 @@ Crate::as_string () const
   rust_debug ("beginning crate recursive as-string");
 
   std::string str ("Crate: ");
-  // add utf8bom and shebang
+  // add utf8bom
   if (has_utf8bom)
 str += "\n has utf8bom";
 
-  if (has_shebang)
-str += "\n has shebang";
-
   // inner attributes
   str += append_attributes (inner_attrs, INNER);
 
diff --git a/gcc/rust/ast/rust-ast.h b/gcc/rust/ast/rust-ast.h
index 0e25de2be5c..ce55e1beb5e 100644
--- a/gcc/rust/ast/rust-ast.h
+++ b/gcc/rust/ast/rust-ast.h
@@ -1551,7 +1551,6 @@ protected:
 struct Crate
 {
   bool has_utf8bom;
-  bool has_shebang;
 
   std::vector inner_attrs;
   // dodgy spacing required here
@@ -1564,17 +1563,16 @@ struct Crate
 public:
   // Constructor
   Crate (std::vector > items,
-std::vector inner_attrs, bool has_utf8bom = false,
-bool has_shebang = false)
-: has_utf8bom (has_utf8bom), has_shebang (has_shebang),
-  inner_attrs (std::move (inner_attrs)), items (std::move (items)),
+std::vector inner_attrs, bool has_utf8bom = false)
+: has_utf8bom (has_utf8bom), inner_attrs (std::move (inner_attrs)),
+  items (std::move (items)),
   node_id (Analysis::Mappings::get ()->get_next_node_id ())
   {}
 
   // Copy constructor with vector clone
   Crate (Crate const &other)
-: has_utf8bom (other.has_utf8bom), has_shebang (other.has_shebang),
-  inner_attrs (other.inner_attrs), node_id (other.node_id)
+: has_utf8bom (other.has_utf8bom), inner_attrs (other.inner_attrs),
+  node_id (other.node_id)
   {
 items.reserve (other.items.size ());
 for (const auto &e : other.items)
@@ -1587,7 +1585,6 @@ public:
   Crate &operator= (Crate const &other)
   {
 inner_attrs = other.inner_attrs;
-has_shebang = other.has_shebang;
 has_utf8bom = other.has_utf8bom;
 node_id = other.node_id;
 
diff --git a/gcc/rust/hir/rust-ast-lower.cc b/gcc/rust/hir/rust-ast-lower.cc
index c7222e2e998..0f3c86dc7bf 100644
--- a/gcc/rust/hir/rust-ast-lower.cc
+++ b/gcc/rust/hir/rust-ast-lower.cc
@@ -41,7 +41,6 @@ ASTLowering::go ()
 {
   std::vector > items;
   bool has_utf8bom = false;
-  bool has_shebang = false;
 
   for (auto it = astCrate.items.begin (); it != astCrate.items.end (); it++)
 {
@@ -57,7 +56,7 @@ ASTLowering::go ()
 UNKNOWN_LOCAL_DEFID);
 
   return HIR::Crate (std::move (items), astCrate.get_inner_attrs (), mapping,
-has_utf8bom, has_shebang);
+has_utf8bom);
 }
 
 // rust-ast-lower-block.h
diff --git a/gcc/rust/hir/tree/rust-hir-full-test.cc 
b/gcc/rust/hir/tree/rust-hir-full-test.cc
index 261b3af672a..051ba8736ad 100644
--- a/gcc/rust/hir/tree/rust-hir-full-test.cc
+++ b/gcc/rust/hir/tree/rust-hir-full-test.cc
@@ -73,15 +73,11 @@ std::string
 Crate::as_string () const
 {
   std::string str ("HIR::Crate: ");
-  // add utf8bom and shebang
+  // add utf8bom
   if (has_utf8bom)
 {
   str += "\n has utf8bom";
 }
-  if (has_shebang)
-{
-  str += "\n has shebang";
-}
 
   // inner attributes
   str += "\n inner attributes: ";
diff --git a/gcc/rust/hir/tree/rust-hir.h b/gcc/rust/hir/tree/rust-hir.h
index 35dc71ae1df..f918f2dc106 100644
--- a/gcc/rust/hir/tree/rust-hir.h
+++ b/gcc/rust/hir/tree/rust-hir.h
@@ -679,7 +679,6 @@ public:
 struct Crate
 {
   bool has_utf8bom;
-  bool has_shebang;
 
   AST::AttrVec inner_attrs;
   // dodgy spacing required here
@@ -692,17 +691,15 @@ struct Crate
 public:
   // Constructor
   Crate (std::vector > items, AST::AttrVec inner_attrs,
-Analysis::NodeMapping mappings, bool has_utf8bom = false,
-bool has_shebang = false)
-: has_utf8bom (has_utf8bom), has_shebang (has_shebang),
-  inner_attrs (std::move (inner_attrs)), items (std::move (items)),
-  mappings (mappings)
+Analysis::NodeMapping mappings, bool has_utf8bom = false)
+: has_utf8bom (has_utf8bom), inner_attrs (std::move (inner_attrs)),
+  items (std::move (items)), mappings (mappings)
   {}
 
   // Copy constructor with vector clone
   Crate (Crate const &other)
-: has_utf8bom (other.has_utf8bom), has_shebang (other.has_shebang),
-  inner_attrs (other.inner_attrs), mappings (ot

Re: shebang handling

2021-07-04 Thread Marc via Gcc-rust
Mark Wielaard  writes:

>  [PATCH 1/2] Handle shebang line, plus any whitespace and comment
>  [PATCH 2/2] Remove has_shebang flag from AST and HIR Crate classes

Pushed to githup:
https://github.com/Rust-GCC/gccrs/pull/546

Marc
-- 
Gcc-rust mailing list
Gcc-rust@gcc.gnu.org
https://gcc.gnu.org/mailman/listinfo/gcc-rust