UTF-8 BOM handling

2021-07-05 Thread Mark Wielaard
Hi,

A rust source file can start with a UTF-8 BOM sequence (EF BB
BF). This simply indicates that the file is encoded as UTF-8 (all rust
input is interpreted as a sequence of Unicode code points encoded in
UTF-8) so can be skipped before starting real lexing.

It isn't necessary to keep track of the BOM in the AST or HIR Crate
classes. So I removed the has_utf8bom flag.

Also included are a couple of simple tests to show we handle the BOM
correctly now.

 [PATCH 1/2] Handle UTF-8 BOM in lexer
 [PATCH 2/2] Remove has_utf8bom flag from AST and HIR Crate classes

Cheers,

Mark
-- 
Gcc-rust mailing list
Gcc-rust@gcc.gnu.org
https://gcc.gnu.org/mailman/listinfo/gcc-rust


[PATCH 1/2] Handle UTF-8 BOM in lexer

2021-07-05 Thread Mark Wielaard
The very first thing in a rust source file might be the optional UTF-8
BOM.  This is the 3 bytes 0xEF 0xBB 0xBF. They can simply be skipped;
they just mark the file as UTF-8.  Add some testcases to show we now
handle such files.
---
 gcc/rust/lex/rust-lex.cc| 13 +
 gcc/testsuite/rust/compile/torture/bom.rs   |  1 +
 gcc/testsuite/rust/compile/torture/bom_comment.rs   |  2 ++
 gcc/testsuite/rust/compile/torture/bom_shebang.rs   |  2 ++
 .../rust/compile/torture/bom_whitespace.rs  |  2 ++
 5 files changed, 20 insertions(+)
 create mode 100644 gcc/testsuite/rust/compile/torture/bom.rs
 create mode 100644 gcc/testsuite/rust/compile/torture/bom_comment.rs
 create mode 100644 gcc/testsuite/rust/compile/torture/bom_shebang.rs
 create mode 100644 gcc/testsuite/rust/compile/torture/bom_whitespace.rs

diff --git a/gcc/rust/lex/rust-lex.cc b/gcc/rust/lex/rust-lex.cc
index ebd69de0fd1..617dd69a080 100644
--- a/gcc/rust/lex/rust-lex.cc
+++ b/gcc/rust/lex/rust-lex.cc
@@ -237,6 +237,19 @@ Lexer::build_token ()
   current_char = peek_input ();
   skip_input ();
 
+  // detect UTF8 bom
+  //
+  // Must be the first thing on the first line.
+  // There might be an optional BOM (Byte Order Mark), which for UTF-8 is
+  // the three bytes 0xEF, 0xBB and 0xBF. These can simply be skipped.
+  if (current_line == 1 && current_column == 1 && current_char == 0xef
+ && peek_input () == 0xbb && peek_input (1) == 0xbf)
+   {
+ skip_input (1);
+ current_char = peek_input ();
+ skip_input ();
+   }
+
   // detect shebang
   // Must be the first thing on the first line, starting with #!
   // But since an attribute can also start with an #! we don't count it as a
diff --git a/gcc/testsuite/rust/compile/torture/bom.rs 
b/gcc/testsuite/rust/compile/torture/bom.rs
new file mode 100644
index 000..5edcab227ee
--- /dev/null
+++ b/gcc/testsuite/rust/compile/torture/bom.rs
@@ -0,0 +1 @@
+pub fn main () { }
diff --git a/gcc/testsuite/rust/compile/torture/bom_comment.rs 
b/gcc/testsuite/rust/compile/torture/bom_comment.rs
new file mode 100644
index 000..020e1707b55
--- /dev/null
+++ b/gcc/testsuite/rust/compile/torture/bom_comment.rs
@@ -0,0 +1,2 @@
+// UTF8 BOM
+pub fn main () { }
diff --git a/gcc/testsuite/rust/compile/torture/bom_shebang.rs 
b/gcc/testsuite/rust/compile/torture/bom_shebang.rs
new file mode 100644
index 000..4c552e8d71d
--- /dev/null
+++ b/gcc/testsuite/rust/compile/torture/bom_shebang.rs
@@ -0,0 +1,2 @@
+#!/usr/bin/cat
+pub fn main () { }
diff --git a/gcc/testsuite/rust/compile/torture/bom_whitespace.rs 
b/gcc/testsuite/rust/compile/torture/bom_whitespace.rs
new file mode 100644
index 000..b10d5654473
--- /dev/null
+++ b/gcc/testsuite/rust/compile/torture/bom_whitespace.rs
@@ -0,0 +1,2 @@
+
+pub fn main () { }
-- 
2.32.0

-- 
Gcc-rust mailing list
Gcc-rust@gcc.gnu.org
https://gcc.gnu.org/mailman/listinfo/gcc-rust


[PATCH 2/2] Remove has_utf8bom flag from AST and HIR Crate classes

2021-07-05 Thread Mark Wielaard
The lexer deals with the UTF-8 BOM and the parser cannot detect
whether there is or isn't a BOM at the start of a file. The flag isn't
relevant or useful in the AST and HIR Crate classes.
---
 gcc/rust/ast/rust-ast-full-test.cc  |  3 ---
 gcc/rust/ast/rust-ast.h | 11 +++
 gcc/rust/hir/rust-ast-lower.cc  |  4 +---
 gcc/rust/hir/tree/rust-hir-full-test.cc |  5 -
 gcc/rust/hir/tree/rust-hir.h| 12 
 gcc/rust/parse/rust-parse-impl.h|  8 +---
 6 files changed, 9 insertions(+), 34 deletions(-)

diff --git a/gcc/rust/ast/rust-ast-full-test.cc 
b/gcc/rust/ast/rust-ast-full-test.cc
index 12ef255bcbf..dd55e1ddbd2 100644
--- a/gcc/rust/ast/rust-ast-full-test.cc
+++ b/gcc/rust/ast/rust-ast-full-test.cc
@@ -172,9 +172,6 @@ Crate::as_string () const
   rust_debug ("beginning crate recursive as-string");
 
   std::string str ("Crate: ");
-  // add utf8bom
-  if (has_utf8bom)
-str += "\n has utf8bom";
 
   // inner attributes
   str += append_attributes (inner_attrs, INNER);
diff --git a/gcc/rust/ast/rust-ast.h b/gcc/rust/ast/rust-ast.h
index ce55e1beb5e..75b08f8aa66 100644
--- a/gcc/rust/ast/rust-ast.h
+++ b/gcc/rust/ast/rust-ast.h
@@ -1550,8 +1550,6 @@ protected:
 // A crate AST object - holds all the data for a single compilation unit
 struct Crate
 {
-  bool has_utf8bom;
-
   std::vector inner_attrs;
   // dodgy spacing required here
   /* TODO: is it better to have a vector of items here or a module (implicit
@@ -1563,16 +1561,14 @@ struct Crate
 public:
   // Constructor
   Crate (std::vector > items,
-std::vector inner_attrs, bool has_utf8bom = false)
-: has_utf8bom (has_utf8bom), inner_attrs (std::move (inner_attrs)),
-  items (std::move (items)),
+std::vector inner_attrs)
+: inner_attrs (std::move (inner_attrs)), items (std::move (items)),
   node_id (Analysis::Mappings::get ()->get_next_node_id ())
   {}
 
   // Copy constructor with vector clone
   Crate (Crate const &other)
-: has_utf8bom (other.has_utf8bom), inner_attrs (other.inner_attrs),
-  node_id (other.node_id)
+: inner_attrs (other.inner_attrs), node_id (other.node_id)
   {
 items.reserve (other.items.size ());
 for (const auto &e : other.items)
@@ -1585,7 +1581,6 @@ public:
   Crate &operator= (Crate const &other)
   {
 inner_attrs = other.inner_attrs;
-has_utf8bom = other.has_utf8bom;
 node_id = other.node_id;
 
 items.reserve (other.items.size ());
diff --git a/gcc/rust/hir/rust-ast-lower.cc b/gcc/rust/hir/rust-ast-lower.cc
index 0f3c86dc7bf..01abd84627b 100644
--- a/gcc/rust/hir/rust-ast-lower.cc
+++ b/gcc/rust/hir/rust-ast-lower.cc
@@ -40,7 +40,6 @@ HIR::Crate
 ASTLowering::go ()
 {
   std::vector > items;
-  bool has_utf8bom = false;
 
   for (auto it = astCrate.items.begin (); it != astCrate.items.end (); it++)
 {
@@ -55,8 +54,7 @@ ASTLowering::go ()
 mappings->get_next_hir_id (crate_num),
 UNKNOWN_LOCAL_DEFID);
 
-  return HIR::Crate (std::move (items), astCrate.get_inner_attrs (), mapping,
-has_utf8bom);
+  return HIR::Crate (std::move (items), astCrate.get_inner_attrs (), mapping);
 }
 
 // rust-ast-lower-block.h
diff --git a/gcc/rust/hir/tree/rust-hir-full-test.cc 
b/gcc/rust/hir/tree/rust-hir-full-test.cc
index 051ba8736ad..05c75e06403 100644
--- a/gcc/rust/hir/tree/rust-hir-full-test.cc
+++ b/gcc/rust/hir/tree/rust-hir-full-test.cc
@@ -73,11 +73,6 @@ std::string
 Crate::as_string () const
 {
   std::string str ("HIR::Crate: ");
-  // add utf8bom
-  if (has_utf8bom)
-{
-  str += "\n has utf8bom";
-}
 
   // inner attributes
   str += "\n inner attributes: ";
diff --git a/gcc/rust/hir/tree/rust-hir.h b/gcc/rust/hir/tree/rust-hir.h
index f918f2dc106..1819d17b585 100644
--- a/gcc/rust/hir/tree/rust-hir.h
+++ b/gcc/rust/hir/tree/rust-hir.h
@@ -678,8 +678,6 @@ public:
 // A crate HIR object - holds all the data for a single compilation unit
 struct Crate
 {
-  bool has_utf8bom;
-
   AST::AttrVec inner_attrs;
   // dodgy spacing required here
   /* TODO: is it better to have a vector of items here or a module (implicit
@@ -691,15 +689,14 @@ struct Crate
 public:
   // Constructor
   Crate (std::vector > items, AST::AttrVec inner_attrs,
-Analysis::NodeMapping mappings, bool has_utf8bom = false)
-: has_utf8bom (has_utf8bom), inner_attrs (std::move (inner_attrs)),
-  items (std::move (items)), mappings (mappings)
+Analysis::NodeMapping mappings)
+: inner_attrs (std::move (inner_attrs)), items (std::move (items)),
+  mappings (mappings)
   {}
 
   // Copy constructor with vector clone
   Crate (Crate const &other)
-: has_utf8bom (other.has_utf8bom), inner_attrs (other.inner_attrs),
-  mappings (other.mappings)
+: inner_attrs (other.inner_attrs), mappings (other.mappings)
   {
 items.reserve (other.items.size ());
 for (const auto &e : other.items)
@@ -712,7