first lexer
This commit is contained in:
parent 6023eb1648
commit 12d58c95f8
4 changed files with 396 additions and 14 deletions
32 Cargo.lock (generated)
@@ -2,6 +2,38 @@
# It is not intended for manual editing.
version = 4

[[package]]
name = "dissimilar"
version = "1.0.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8975ffdaa0ef3661bfe02dbdcc06c9f829dfafe6a3c474de366a8d5e44276921"

[[package]]
name = "expect-test"
version = "1.5.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "63af43ff4431e848fb47472a920f14fa71c24de13255a5692e93d4e90302acb0"
dependencies = [
 "dissimilar",
 "once_cell",
]

[[package]]
name = "once_cell"
version = "1.21.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d"

[[package]]
name = "unicode-xid"
version = "0.2.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ebc1c04c71510c7f702b52b7c350734c9ff1295c464a03335b00bb84fc54f853"

[[package]]
name = "wyrd"
version = "0.1.0"
dependencies = [
 "expect-test",
 "unicode-xid",
]
4 Cargo.toml
@@ -7,3 +7,7 @@ license = "GPL-3.0-only"
repository = "https://git.wires.systems/wires/wyrd"

[dependencies]
unicode-xid = "0.2.6"

[dev-dependencies]
expect-test = "1.5.1"
359 src/lexer.rs (new file)
@@ -0,0 +1,359 @@
use std::iter;
use std::str::Chars;

use unicode_xid::UnicodeXID;

const EOF_CHAR: char = '\0';

fn is_whitespace(c: char) -> bool {
    matches!(
        c,
        // ASCII whitespace
        '\u{0009}' // \t
        | '\u{000B}' // vertical tab
        | '\u{000C}' // form feed
        | '\u{000D}' // \r
        | '\u{0020}' // space

        // NEXT LINE from latin1
        | '\u{0085}'

        // Bidi markers
        | '\u{200E}' // LEFT-TO-RIGHT MARK
        | '\u{200F}' // RIGHT-TO-LEFT MARK

        // Dedicated whitespace characters from Unicode
        | '\u{2028}' // LINE SEPARATOR
        | '\u{2029}' // PARAGRAPH SEPARATOR
    )
}

fn is_id_start(c: char) -> bool {
    c == '_' || c.is_xid_start()
}

fn is_id_continue(c: char) -> bool {
    c.is_xid_continue()
}

fn is_bin_digit(c: char) -> bool {
    matches!(c, '_' | '0' | '1')
}

fn is_hex_digit(c: char) -> bool {
    matches!(c, '_' | '0'..='9' | 'a'..='z' | 'A'..='Z')
}

fn is_dec_digit(c: char) -> bool {
    matches!(c, '_' | '0'..='9')
}

#[repr(u8)]
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TokenKind {
    Unknown,
    Eof,
    Newline,
    Identifier,
    BinaryLiteral,
    HexLiteral,
    DecimalLiteral,
    String,
    UnterminatedString,
    /// ->
    Arrow,
    /// |>
    Pipeline,
    /// =
    Equal,
    /// ==
    DoubleEqual,
    /// !=
    NotEqual,
    /// !
    Bang,
    /// &
    Ampersand,
    /// |
    Bar,
    /// ^
    Caret,
    /// +
    Plus,
    /// -
    Minus,
    /// *
    Star,
    /// /
    Slash,
    /// %
    Percent,
    /// ,
    Comma,
    /// .
    Dot,
    /// :
    Colon,
    /// ;
    Semicolon,
    /// (
    LParen,
    /// )
    RParen,
    /// [
    LBracket,
    /// ]
    RBracket,
    /// {
    LBrace,
    /// }
    RBrace,
}

#[derive(Debug)]
pub struct Token {
    kind: TokenKind,
    offset: u32,
}

struct Cursor<'a> {
    chars: Chars<'a>,
    remaining: usize,
    offset: u32,
}

impl<'a> Cursor<'a> {
    fn new(input: &'a str) -> Self {
        Self {
            chars: input.chars(),
            remaining: input.len(),
            offset: 0,
        }
    }

    fn peek(&self) -> char {
        self.chars.clone().next().unwrap_or(EOF_CHAR)
    }

    fn bump(&mut self) -> Option<char> {
        self.chars.next()
    }

    fn as_str(&self) -> &str {
        self.chars.as_str()
    }

    fn is_eof(&self) -> bool {
        self.as_str().is_empty()
    }

    fn eat_while(&mut self, predicate: impl Fn(char) -> bool) {
        while predicate(self.peek()) && !self.is_eof() {
            self.bump();
        }
    }

    fn next_token(&mut self) -> Token {
        self.offset += (self.remaining - self.as_str().len()) as u32;
        self.remaining = self.as_str().len();

        let c = match self.bump() {
            Some(c) => c,
            None => {
                return Token {
                    kind: TokenKind::Eof,
                    offset: self.offset,
                };
            }
        };

        let kind = match c {
            '\n' => TokenKind::Newline,
            c if is_whitespace(c) => {
                self.eat_while(is_whitespace);
                return self.next_token();
            }
            c if is_id_start(c) => {
                self.eat_while(is_id_continue);
                TokenKind::Identifier
            }
            '0' => self.int_literal_base(),
            '1'..='9' => self.int_literal(),
            '"' => self.string_literal(),
            '=' => {
                if self.peek() == '=' {
                    self.bump();
                    TokenKind::DoubleEqual
                } else {
                    TokenKind::Equal
                }
            }
            '!' => {
                if self.peek() == '=' {
                    self.bump();
                    TokenKind::NotEqual
                } else {
                    TokenKind::Bang
                }
            }
            '&' => TokenKind::Ampersand,
            '|' => {
                if self.peek() == '>' {
                    self.bump();
                    TokenKind::Pipeline
                } else {
                    TokenKind::Bar
                }
            }
            '^' => TokenKind::Caret,
            '+' => TokenKind::Plus,
            '-' => {
                if self.peek() == '>' {
                    self.bump();
                    TokenKind::Arrow
                } else {
                    TokenKind::Minus
                }
            }
            '*' => TokenKind::Star,
            '/' => TokenKind::Slash,
            ',' => TokenKind::Comma,
            '.' => TokenKind::Dot,
            ':' => TokenKind::Colon,
            ';' => TokenKind::Semicolon,
            '(' => TokenKind::LParen,
            ')' => TokenKind::RParen,
            '[' => TokenKind::LBracket,
            ']' => TokenKind::RBracket,
            '{' => TokenKind::LBrace,
            '}' => TokenKind::RBrace,
            _ => todo!(),
        };

        Token {
            kind,
            offset: self.offset,
        }
    }

    fn int_literal(&mut self) -> TokenKind {
        self.eat_while(|c| c.is_ascii_digit());
        TokenKind::DecimalLiteral
    }

    fn int_literal_base(&mut self) -> TokenKind {
        match self.peek() {
            'b' => {
                self.bump();
                self.eat_while(is_bin_digit);
                TokenKind::BinaryLiteral
            }
            'x' => {
                self.bump();
                self.eat_while(is_hex_digit);
                TokenKind::HexLiteral
            }
            '0'..='9' => {
                self.bump();
                self.eat_while(is_dec_digit);
                TokenKind::DecimalLiteral
            }
            _ => TokenKind::DecimalLiteral,
        }
    }

    fn string_literal(&mut self) -> TokenKind {
        while let Some(c) = self.bump() {
            if c == '"' {
                return TokenKind::String;
            } else if c == '\\' {
                self.bump();
            }
        }
        TokenKind::UnterminatedString
    }
}

pub fn tokenize(input: &str) -> impl Iterator<Item = Token> {
    let mut cursor = Cursor::new(input);
    iter::from_fn(move || {
        let t = cursor.next_token();
        (t.kind != TokenKind::Eof).then_some(t)
    })
}

#[cfg(test)]
mod test {
    use expect_test::{Expect, expect};

    use super::*;

    fn check_lexing(src: &str, expect: Expect) {
        let actual: String = tokenize(src).map(|t| format!("{t:?}\n")).collect();
        expect.assert_eq(&actual);
    }

    #[test]
    fn smoke_test() {
        check_lexing(
            "fn sum([h | t]: [int], acc: int) -> sum(t, acc + h)",
            expect![[r#"
                Token { kind: Identifier, offset: 0 }
                Token { kind: Identifier, offset: 3 }
                Token { kind: LParen, offset: 6 }
                Token { kind: LBracket, offset: 7 }
                Token { kind: Identifier, offset: 8 }
                Token { kind: Bar, offset: 10 }
                Token { kind: Identifier, offset: 12 }
                Token { kind: RBracket, offset: 13 }
                Token { kind: Colon, offset: 14 }
                Token { kind: LBracket, offset: 16 }
                Token { kind: Identifier, offset: 17 }
                Token { kind: RBracket, offset: 20 }
                Token { kind: Comma, offset: 21 }
                Token { kind: Identifier, offset: 23 }
                Token { kind: Colon, offset: 26 }
                Token { kind: Identifier, offset: 28 }
                Token { kind: RParen, offset: 31 }
                Token { kind: Arrow, offset: 33 }
                Token { kind: Identifier, offset: 36 }
                Token { kind: LParen, offset: 39 }
                Token { kind: Identifier, offset: 40 }
                Token { kind: Comma, offset: 41 }
                Token { kind: Identifier, offset: 43 }
                Token { kind: Plus, offset: 47 }
                Token { kind: Identifier, offset: 49 }
                Token { kind: RParen, offset: 50 }
            "#]],
        );
    }

    #[test]
    fn int_literals() {
        check_lexing(
            "0xDeAdBeEf 0b_1101_1011 0987 1337",
            expect![[r#"
                Token { kind: HexLiteral, offset: 0 }
                Token { kind: BinaryLiteral, offset: 11 }
                Token { kind: DecimalLiteral, offset: 24 }
                Token { kind: DecimalLiteral, offset: 29 }
            "#]],
        );
    }

    #[test]
    fn strings() {
        check_lexing(
            r#"
            "meowing wow \""
            "abc \"
            "#,
            expect![[r#"
                Token { kind: Newline, offset: 0 }
                Token { kind: String, offset: 13 }
                Token { kind: Newline, offset: 29 }
                Token { kind: UnterminatedString, offset: 42 }
            "#]],
        );
    }
}
15 src/lib.rs
@@ -1,14 +1 @@
-pub fn add(left: u64, right: u64) -> u64 {
-    left + right
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-
-    #[test]
-    fn it_works() {
-        let result = add(2, 2);
-        assert_eq!(result, 4);
-    }
-}
+mod lexer;
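
One thing the diff leaves implicit: Token records only the byte offset where a lexeme starts, not its length or text, so spans have to be recovered by slicing the original input between consecutive token offsets. The sketch below is a minimal illustration of that idea, not part of this commit; it assumes it would sit inside src/lexer.rs (next to the existing mod test) so the private kind and offset fields are in scope, and the offset_demo module and lexemes helper are hypothetical names. The committed expect-test snapshots themselves can be regenerated by running the tests with UPDATE_EXPECT=1.

// Hypothetical sketch, not part of this commit: assumes placement inside
// src/lexer.rs so the private `kind`/`offset` fields of `Token` are visible.
// Lexeme text is recovered by slicing the input between consecutive token
// start offsets and trimming the whitespace the lexer skipped.
#[cfg(test)]
mod offset_demo {
    use super::*;

    /// Pairs each token kind with the source text it covers.
    fn lexemes(src: &str) -> Vec<(TokenKind, String)> {
        let tokens: Vec<Token> = tokenize(src).collect();
        let mut out = Vec::new();
        for (i, t) in tokens.iter().enumerate() {
            let start = t.offset as usize;
            // The next token's start (or the end of input) bounds this lexeme.
            let end = tokens
                .get(i + 1)
                .map(|next| next.offset as usize)
                .unwrap_or(src.len());
            // Anything between the lexeme and the next token is whitespace
            // trivia, so trim it from the right-hand side.
            let text = src[start..end].trim_end_matches(is_whitespace);
            out.push((t.kind, text.to_string()));
        }
        out
    }

    #[test]
    fn offsets_recover_lexemes() {
        let got = lexemes("acc + 0xff");
        assert_eq!(got[0], (TokenKind::Identifier, "acc".to_string()));
        assert_eq!(got[1], (TokenKind::Plus, "+".to_string()));
        assert_eq!(got[2], (TokenKind::HexLiteral, "0xff".to_string()));
    }
}

Whether spans stay derived this way or a length/end offset is added to Token later is an open design question; the sketch only shows what the current start-offset representation already supports.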