@@ 97,33 97,54 @@ pub const Tokenizer = struct {
}
}
+ pub fn isNextSymbolId(tokenizer: *Tokenizer, symbol_id: Symbol.Id) bool {
+ return tokenizer.nextSymbolId() == symbol_id;
+ }
+
/// get the text contents of the next word, if the next symbol is a token,
/// and it's a valid word.
pub fn peekWord(tokenizer: *Tokenizer) ?[]const u8 {
if (!tokenizer.isNextSymbolId(.Token)) return null;
- // a symbol of type 'Token' should always have at least one
- // character left.
- var word_length: u8 = 1;
+ var word_length: usize = 0;
while (tokenizer.text.hasRemaining(word_length + 1)) {
const char = tokenizer.text.at(word_length);
switch (char) {
- '\n', ')' => return tokenizer.text.peek(word_length),
+ '\n', ')' => break,
'$', '`', '\'', '"', '\\' => return null,
else => {},
}
if (isOperatorStart(char) or std.ascii.isBlank(char)) {
- return tokenizer.text.peek(word_length);
+ break;
}
word_length += 1;
}
- return tokenizer.text.peek(word_length);
+ return if (word_length > 0) tokenizer.text.peek(word_length) else null;
}
- pub fn isNextSymbolId(tokenizer: *Tokenizer, symbol_id: Symbol.Id) bool {
- return tokenizer.nextSymbolId() == symbol_id;
+ // see: https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap03.html#tag_03_235
+
+ /// Peek at a single shell name. A name is a word consisting solely of
+ /// underscores, digits, and alphabetics from the portable character set,
+ /// and the first character is not a digit.
+ pub fn peekName(tokenizer: *Tokenizer) ?[]const u8 {
+ if (!tokenizer.isNextSymbolId(.Token)) return null;
+ // text must have a character left if the next symbol is a token
+ const first_char = tokenizer.text.peekChar().?;
+ if (std.ascii.isDigit(first_char)) return null;
+ if (first_char != '_' and !std.ascii.isAlNum(first_char)) return null;
+
+ var word_length: usize = 1;
+ while (tokenizer.text.hasRemaining(word_length + 1)) {
+ const char = tokenizer.text.at(word_length);
+ if (char != '_' and !std.ascii.isAlNum(char)) {
+ break;
+ }
+ word_length += 1;
+ }
+ return tokenizer.text.peek(word_length);
}
/// Consume the next token, if it exists, and matches token_text. If it
@@ 280,6 301,35 @@ test "Tokenizer.peekWord" {
_ = text.read(3);
t.expectEqualSlices(u8, "Lines", tokenizer.peekWord().?);
_ = text.read(6);
+ try text.append("inval$id");
+ t.expect(null == tokenizer.peekWord());
+}
+
+test "Tokenizer.peekName" {
+ const t = std.testing;
+
+ var text = try TextBuffer.init(std.heap.direct_allocator, "Some names");
+ defer text.deinit();
+
+ var tokenizer = Tokenizer.init(&text);
+ defer tokenizer.deinit();
+ t.expectEqualSlices(u8, "Some", tokenizer.peekName().?);
+ t.expectEqualSlices(u8, "Some", tokenizer.peekName().?);
+ _ = text.read(4);
+ t.expectEqualSlices(u8, "names", tokenizer.peekName().?);
+ t.expectEqualSlices(u8, "names", tokenizer.peekName().?);
+ _ = text.read(5);
+ t.expect(null == tokenizer.peekName());
+ try text.append("delimit%with@non-alpha_numerics");
+ t.expectEqualSlices(u8, "delimit", tokenizer.peekName().?);
+ _ = text.read(8);
+ t.expectEqualSlices(u8, "with", tokenizer.peekName().?);
+ _ = text.read(4);
+ t.expect(null == tokenizer.peekName());
+ _ = text.readChar();
+ t.expectEqualSlices(u8, "non", tokenizer.peekName().?);
+ _ = text.read(4);
+ t.expectEqualSlices(u8, "alpha_numerics", tokenizer.peekName().?);
}
test "Tokenizer.readNextToken" {
@@ 321,7 371,7 @@ test "Tokenizer.readNextToken" {
t.expectEqual(
TextBuffer.Range{
.start = TextBuffer.Pos{ .offset = 23, .line = 2, .column = 12 },
- .end = TextBuffer.Pos{ .offset = 27, .line = 2, .column = 16},
+ .end = TextBuffer.Pos{ .offset = 27, .line = 2, .column = 16 },
},
tokenizer.readNextToken("more").?,
);