@@ 1273,6 1273,8 @@ Lexer::parse_escape (char opening_char)
rust_error_at (get_current_location (),
"cannot have a unicode escape \\u in a byte %s",
opening_char == '\'' ? "character" : "string");
+ // Try to parse it anyway, just to skip it
+ parse_partial_unicode_escape ();
return std::make_tuple (output_char, additional_length_offset, false);
case '\r':
case '\n':
@@ 1461,16 1463,34 @@ Lexer::parse_partial_unicode_escape ()
{
skip_input ();
current_char = peek_input ();
- int additional_length_offset = 1;
+ int additional_length_offset = 0;
- bool need_close_brace = false;
- if (current_char == '{')
+ if (current_char != '{')
{
- need_close_brace = true;
+ rust_error_at (get_current_location (),
+ "unicode escape should start with %<{%>");
+ /* Skip what should probably have been between brackets. */
+ while (is_x_digit (current_char) || current_char == '_')
+ {
+ skip_input ();
+ current_char = peek_input ();
+ additional_length_offset++;
+ }
+ return std::make_pair (Codepoint (0), additional_length_offset);
+ }
+ skip_input ();
+ current_char = peek_input ();
+ additional_length_offset++;
+
+ if (current_char == '_')
+ {
+ rust_error_at (get_current_location (),
+ "unicode escape cannot start with %<_%>");
skip_input ();
current_char = peek_input ();
additional_length_offset++;
+ // fallthrough and try to parse the rest anyway
}
// parse unicode escape - 1-6 hex digits
@@ 1500,21 1520,45 @@ Lexer::parse_partial_unicode_escape ()
current_char = peek_input ();
}
- // ensure closing brace if required
- if (need_close_brace)
+ if (current_char == '}')
{
- if (current_char == '}')
+ skip_input ();
+ current_char = peek_input ();
+ additional_length_offset++;
+ }
+ else
+ {
+ // actually an error, but allow propagation anyway. Assume that a
+ // wrong bracket, whitespace or single/double quotes are a wrong
+ // termination; otherwise it is a wrong character, so skip to the
+ // actual terminator.
+ if (current_char == '{' || is_whitespace (current_char)
+ || current_char == '\'' || current_char == '"')
{
- skip_input ();
- current_char = peek_input ();
- additional_length_offset++;
+ rust_error_at (get_current_location (),
+ "expected terminating %<}%> in unicode escape");
+ return std::make_pair (Codepoint (0), additional_length_offset);
}
else
{
- // actually an error, but allow propagation anyway
rust_error_at (get_current_location (),
- "expected terminating %<}%> in unicode escape");
- // return false;
+ "invalid character %<%c%> in unicode escape",
+ current_char);
+ while (current_char != '}' && current_char != '{'
+ && !is_whitespace (current_char) && current_char != '\''
+ && current_char != '"')
+ {
+ skip_input ();
+ current_char = peek_input ();
+ additional_length_offset++;
+ }
+ // Consume the actual closing bracket if found
+ if (current_char == '}')
+ {
+ skip_input ();
+ current_char = peek_input ();
+ additional_length_offset++;
+ }
return std::make_pair (Codepoint (0), additional_length_offset);
}
}
@@ 1530,10 1574,22 @@ Lexer::parse_partial_unicode_escape ()
return std::make_pair (Codepoint (0), additional_length_offset);
}
- long hex_num = std::strtol (num_str.c_str (), nullptr, 16);
+ unsigned long hex_num = std::strtoul (num_str.c_str (), nullptr, 16);
- // assert fits a uint32_t
- gcc_assert (hex_num < 4294967296);
+ if (hex_num > 0xd7ff && hex_num < 0xe000)
+ {
+ rust_error_at (
+ get_current_location (),
+ "unicode escape cannot be a surrogate value (D800 to DFFF)");
+ return std::make_pair (Codepoint (0), additional_length_offset);
+ }
+
+ if (hex_num > 0x10ffff)
+ {
+ rust_error_at (get_current_location (),
+ "unicode escape cannot be larger than 10FFFF");
+ return std::make_pair (Codepoint (0), additional_length_offset);
+ }
// return true;
return std::make_pair (Codepoint (static_cast<uint32_t> (hex_num)),
@@ 0,0 1,60 @@
+fn main ()
+{
+ // Braces are required
+ let _cbl = '\u013'; // { dg-error "unicode escape" }
+ let _sbl = "\u013"; // { dg-error "unicode escape" }
+
+ // One to six hex digits
+ let _c0 = '\u{}'; // { dg-error "unicode escape" }
+ let _c1 = '\u{0}';
+ let _c2 = '\u{00}';
+ let _c3 = '\u{000}';
+ let _c4 = '\u{0000}';
+ let _c5 = '\u{00000}';
+ let _c6 = '\u{000000}';
+ let _c7 = '\u{0000000}'; // { dg-error "unicode escape" }
+
+ let _s0 = "\u{}"; // { dg-error "unicode escape" }
+ let _s1 = "\u{0}";
+ let _s2 = "\u{00}";
+ let _s3 = "\u{000}";
+ let _s4 = "\u{0000}";
+ let _s5 = "\u{00000}";
+ let _s6 = "\u{000000}";
+ let _s7 = "\u{0000000}"; // { dg-error "unicode escape" }
+
+ // Underscores OK except for start
+ let _c_ = '\u{00___01__0_1_}';
+ let _s_ = "\u{00___01__0_1_}";
+ let _c__ = '\u{_00__01__0_}'; // { dg-error "unicode escape" }
+ let _s__ = "\u{_00__01__0_}"; // { dg-error "unicode escape" }
+
+ // Must be hex chars (checked for both char and string literals)
+ let _chex = '\u{hex}'; // { dg-error "unicode escape" }
+ let _shex = "\u{hex}"; // { dg-error "unicode escape" }
+
+ // Only valid from 0x0 to 0xD7FF and from 0xE000 to 0x10FFFF
+ let _cd7ff = '\u{D7FF}';
+ let _sd7ff = "\u{D7FF}";
+ let _cd800 = '\u{D800}'; // { dg-error "unicode escape" }
+ let _sd800 = "\u{D800}"; // { dg-error "unicode escape" }
+
+ let _cdfff = '\u{DFFF}'; // { dg-error "unicode escape" }
+ let _sdfff = "\u{DFFF}"; // { dg-error "unicode escape" }
+ let _ce000 = '\u{E000}';
+ let _se000 = "\u{E000}";
+
+ let _clast = '\u{10FFFF}';
+ let _slast = "\u{10FFFF}";
+ let _clast1 = '\u{110000}'; // { dg-error "unicode escape" }
+ let _slast1 = "\u{110000}"; // { dg-error "unicode escape" }
+
+ let _cffffff = '\u{FFFFFF}'; // { dg-error "unicode escape" }
+ let _sffffff = "\u{FFFFFF}"; // { dg-error "unicode escape" }
+
+ // unicode escapes cannot be used in bytes or byte strings.
+ // Except in raw byte strings (where they aren't escapes).
+ let _bc = b'\u{000A}'; // { dg-error "unicode escape" }
+ let _bs = b"\u{000A}"; // { dg-error "unicode escape" }
+ let _rbs = br"\u{000A}";
+}