diff --git a/Cargo.lock b/Cargo.lock
index af8dfb3..9600122 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2,6 +2,38 @@
 # It is not intended for manual editing.
 version = 4
 
+[[package]]
+name = "dissimilar"
+version = "1.0.10"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8975ffdaa0ef3661bfe02dbdcc06c9f829dfafe6a3c474de366a8d5e44276921"
+
+[[package]]
+name = "expect-test"
+version = "1.5.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "63af43ff4431e848fb47472a920f14fa71c24de13255a5692e93d4e90302acb0"
+dependencies = [
+ "dissimilar",
+ "once_cell",
+]
+
+[[package]]
+name = "once_cell"
+version = "1.21.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d"
+
+[[package]]
+name = "unicode-xid"
+version = "0.2.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ebc1c04c71510c7f702b52b7c350734c9ff1295c464a03335b00bb84fc54f853"
+
 [[package]]
 name = "wyrd"
 version = "0.1.0"
+dependencies = [
+ "expect-test",
+ "unicode-xid",
+]
diff --git a/Cargo.toml b/Cargo.toml
index ecbe8f9..a2393db 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -7,3 +7,7 @@ license = "GPL-3.0-only"
 repository = "https://git.wires.systems/wires/wyrd"
 
 [dependencies]
+unicode-xid = "0.2.6"
+
+[dev-dependencies]
+expect-test = "1.5.1"
diff --git a/src/lexer.rs b/src/lexer.rs
new file mode 100644
index 0000000..86859e6
--- /dev/null
+++ b/src/lexer.rs
@@ -0,0 +1,421 @@
+//! Hand-written lexer that turns source text into a stream of [`Token`]s.
+
+use std::iter;
+use std::str::Chars;
+
+use unicode_xid::UnicodeXID;
+
+/// Sentinel returned by [`Cursor::peek`] once the input is exhausted.
+const EOF_CHAR: char = '\0';
+
+/// Returns `true` for characters that are skipped between tokens.
+///
+/// `\n` is not in this set; [`Cursor::next_token`] lexes it as [`TokenKind::Newline`].
+fn is_whitespace(c: char) -> bool {
+    matches!(
+        c,
+        // ASCII whitespace
+        '\u{0009}' // \t
+        | '\u{000B}' // vertical tab
+        | '\u{000C}' // form feed
+        | '\u{000D}' // \r
+        | '\u{0020}' // space
+
+        // NEXT LINE from latin1
+        | '\u{0085}'
+
+        // Bidi markers
+        | '\u{200E}' // LEFT-TO-RIGHT MARK
+        | '\u{200F}' // RIGHT-TO-LEFT MARK
+
+        // Dedicated whitespace characters from Unicode
+        | '\u{2028}' // LINE SEPARATOR
+        | '\u{2029}' // PARAGRAPH SEPARATOR
+    )
+}
+
+/// Returns `true` if `c` can start an identifier: `_` or an `XID_Start` character.
+fn is_id_start(c: char) -> bool {
+    c == '_' || c.is_xid_start()
+}
+
+/// Returns `true` if `c` can continue an identifier (`XID_Continue`).
+fn is_id_continue(c: char) -> bool {
+    c.is_xid_continue()
+}
+
+/// Returns `true` for binary digits and the `_` digit separator.
+fn is_bin_digit(c: char) -> bool {
+    matches!(c, '_' | '0' | '1')
+}
+
+/// Returns `true` for hexadecimal digits (either case) and the `_` digit separator.
+fn is_hex_digit(c: char) -> bool {
+    matches!(c, '_' | '0'..='9' | 'a'..='f' | 'A'..='F')
+}
+
+/// Returns `true` for decimal digits and the `_` digit separator.
+fn is_dec_digit(c: char) -> bool {
+    matches!(c, '_' | '0'..='9')
+}
+
+/// The category of a lexed [`Token`].
+#[repr(u8)]
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum TokenKind {
+    /// A character that does not start any known token.
+    Unknown,
+    /// End of input.
+    Eof,
+    /// A `\n` line feed.
+    Newline,
+    /// `_` or an `XID_Start` character followed by `XID_Continue` characters.
+    Identifier,
+    /// `0b` followed by any run of binary digits and `_`.
+    BinaryLiteral,
+    /// `0x` followed by any run of hexadecimal digits and `_`.
+    HexLiteral,
+    /// A decimal digit followed by any run of decimal digits and `_`.
+    DecimalLiteral,
+    /// A double-quoted string including its closing quote.
+    String,
+    /// A double-quoted string with no closing quote before the end of input.
+    UnterminatedString,
+    /// ->
+    Arrow,
+    /// |>
+    Pipeline,
+    /// =
+    Equal,
+    /// ==
+    DoubleEqual,
+    /// !=
+    NotEqual,
+    /// !
+    Bang,
+    /// &
+    Ampersand,
+    /// |
+    Bar,
+    /// ^
+    Caret,
+    /// +
+    Plus,
+    /// -
+    Minus,
+    /// *
+    Star,
+    /// /
+    Slash,
+    /// %
+    Percent,
+    /// ,
+    Comma,
+    /// .
+    Dot,
+    /// :
+    Colon,
+    /// ;
+    Semicolon,
+    /// (
+    LParen,
+    /// )
+    RParen,
+    /// [
+    LBracket,
+    /// ]
+    RBracket,
+    /// {
+    LBrace,
+    /// }
+    RBrace,
+}
+
+/// A lexed token: its kind and the byte offset in the input where it starts.
+#[derive(Debug)]
+pub struct Token {
+    kind: TokenKind,
+    offset: u32,
+}
+
+/// Lexer state over a borrowed input string.
+struct Cursor<'a> {
+    /// Characters not yet consumed.
+    chars: Chars<'a>,
+    /// Bytes that were left in `chars` when the current token started.
+    remaining: usize,
+    /// Byte offset of the current token's first character.
+    offset: u32,
+}
+
+impl<'a> Cursor<'a> {
+    /// Creates a cursor positioned at the start of `input`.
+    fn new(input: &'a str) -> Self {
+        Self {
+            chars: input.chars(),
+            remaining: input.len(),
+            offset: 0,
+        }
+    }
+
+    /// Returns the next character without consuming it, or [`EOF_CHAR`] at end of input.
+    fn peek(&self) -> char {
+        self.chars.clone().next().unwrap_or(EOF_CHAR)
+    }
+
+    /// Consumes and returns the next character, or `None` at end of input.
+    fn bump(&mut self) -> Option<char> {
+        self.chars.next()
+    }
+
+    /// Returns the part of the input that has not been consumed yet.
+    fn as_str(&self) -> &str {
+        self.chars.as_str()
+    }
+
+    /// Returns `true` once every character has been consumed.
+    fn is_eof(&self) -> bool {
+        self.as_str().is_empty()
+    }
+
+    /// Consumes characters for as long as `predicate` accepts them.
+    fn eat_while(&mut self, predicate: impl Fn(char) -> bool) {
+        // Check for EOF first so the `EOF_CHAR` sentinel never reaches `predicate`.
+        while !self.is_eof() && predicate(self.peek()) {
+            self.bump();
+        }
+    }
+
+    /// Lexes and returns the next token, skipping any whitespace before it.
+    ///
+    /// Returns a [`TokenKind::Eof`] token once the input is exhausted.
+    fn next_token(&mut self) -> Token {
+        // Advance `offset` by the number of bytes consumed since the previous call.
+        // NOTE: offsets are `u32`, so inputs of 4 GiB or more are not representable.
+        self.offset += (self.remaining - self.as_str().len()) as u32;
+        self.remaining = self.as_str().len();
+
+        let c = match self.bump() {
+            Some(c) => c,
+            None => {
+                return Token {
+                    kind: TokenKind::Eof,
+                    offset: self.offset,
+                };
+            }
+        };
+
+        let kind = match c {
+            '\n' => TokenKind::Newline,
+            c if is_whitespace(c) => {
+                // Whitespace yields no token; restart so `offset` moves past it.
+                self.eat_while(is_whitespace);
+                return self.next_token();
+            }
+            c if is_id_start(c) => {
+                self.eat_while(is_id_continue);
+                TokenKind::Identifier
+            }
+            '0' => self.int_literal_base(),
+            '1'..='9' => self.int_literal(),
+            '"' => self.string_literal(),
+            '=' => {
+                if self.peek() == '=' {
+                    self.bump();
+                    TokenKind::DoubleEqual
+                } else {
+                    TokenKind::Equal
+                }
+            }
+            '!' => {
+                if self.peek() == '=' {
+                    self.bump();
+                    TokenKind::NotEqual
+                } else {
+                    TokenKind::Bang
+                }
+            }
+            '&' => TokenKind::Ampersand,
+            '|' => {
+                if self.peek() == '>' {
+                    self.bump();
+                    TokenKind::Pipeline
+                } else {
+                    TokenKind::Bar
+                }
+            }
+            '^' => TokenKind::Caret,
+            '+' => TokenKind::Plus,
+            '-' => {
+                if self.peek() == '>' {
+                    self.bump();
+                    TokenKind::Arrow
+                } else {
+                    TokenKind::Minus
+                }
+            }
+            '*' => TokenKind::Star,
+            '/' => TokenKind::Slash,
+            '%' => TokenKind::Percent,
+            ',' => TokenKind::Comma,
+            '.' => TokenKind::Dot,
+            ':' => TokenKind::Colon,
+            ';' => TokenKind::Semicolon,
+            '(' => TokenKind::LParen,
+            ')' => TokenKind::RParen,
+            '[' => TokenKind::LBracket,
+            ']' => TokenKind::RBracket,
+            '{' => TokenKind::LBrace,
+            '}' => TokenKind::RBrace,
+            // Report anything else as `Unknown` instead of panicking on user input.
+            _ => TokenKind::Unknown,
+        };
+
+        Token {
+            kind,
+            offset: self.offset,
+        }
+    }
+
+    /// Lexes the rest of a decimal literal whose first digit is already consumed.
+    fn int_literal(&mut self) -> TokenKind {
+        // Accept `_` separators, matching the binary and hexadecimal forms.
+        self.eat_while(is_dec_digit);
+        TokenKind::DecimalLiteral
+    }
+
+    /// Lexes the rest of an integer literal whose leading `0` is already consumed.
+    ///
+    /// A following `b` or `x` selects binary or hexadecimal; anything else is decimal.
+    fn int_literal_base(&mut self) -> TokenKind {
+        match self.peek() {
+            'b' => {
+                self.bump();
+                self.eat_while(is_bin_digit);
+                TokenKind::BinaryLiteral
+            }
+            'x' => {
+                self.bump();
+                self.eat_while(is_hex_digit);
+                TokenKind::HexLiteral
+            }
+            _ => self.int_literal(),
+        }
+    }
+
+    /// Lexes the rest of a string literal whose opening `"` is already consumed.
+    ///
+    /// A backslash escapes the character after it, so `\"` does not end the string.
+    fn string_literal(&mut self) -> TokenKind {
+        while let Some(c) = self.bump() {
+            if c == '"' {
+                return TokenKind::String;
+            } else if c == '\\' {
+                // Skip the escaped character, whatever it is.
+                self.bump();
+            }
+        }
+        TokenKind::UnterminatedString
+    }
+}
+
+/// Returns an iterator over the tokens of `input`; the final `Eof` token is not yielded.
+pub fn tokenize(input: &str) -> impl Iterator<Item = Token> {
+    let mut cursor = Cursor::new(input);
+    iter::from_fn(move || {
+        let t = cursor.next_token();
+        (t.kind != TokenKind::Eof).then_some(t)
+    })
+}
+
+#[cfg(test)]
+mod test {
+    use expect_test::{Expect, expect};
+
+    use super::*;
+
+    /// Lexes `src` and compares the debug-printed tokens, one per line, with `expect`.
+    fn check_lexing(src: &str, expect: Expect) {
+        let actual: String = tokenize(src).map(|t| format!("{t:?}\n")).collect();
+        expect.assert_eq(&actual);
+    }
+
+    #[test]
+    fn smoke_test() {
+        check_lexing(
+            "fn sum([h | t]: [int], acc: int) -> sum(t, acc + h)",
+            expect![[r#"
+                Token { kind: Identifier, offset: 0 }
+                Token { kind: Identifier, offset: 3 }
+                Token { kind: LParen, offset: 6 }
+                Token { kind: LBracket, offset: 7 }
+                Token { kind: Identifier, offset: 8 }
+                Token { kind: Bar, offset: 10 }
+                Token { kind: Identifier, offset: 12 }
+                Token { kind: RBracket, offset: 13 }
+                Token { kind: Colon, offset: 14 }
+                Token { kind: LBracket, offset: 16 }
+                Token { kind: Identifier, offset: 17 }
+                Token { kind: RBracket, offset: 20 }
+                Token { kind: Comma, offset: 21 }
+                Token { kind: Identifier, offset: 23 }
+                Token { kind: Colon, offset: 26 }
+                Token { kind: Identifier, offset: 28 }
+                Token { kind: RParen, offset: 31 }
+                Token { kind: Arrow, offset: 33 }
+                Token { kind: Identifier, offset: 36 }
+                Token { kind: LParen, offset: 39 }
+                Token { kind: Identifier, offset: 40 }
+                Token { kind: Comma, offset: 41 }
+                Token { kind: Identifier, offset: 43 }
+                Token { kind: Plus, offset: 47 }
+                Token { kind: Identifier, offset: 49 }
+                Token { kind: RParen, offset: 50 }
+            "#]],
+        );
+    }
+
+    #[test]
+    fn int_literals() {
+        check_lexing(
+            "0xDeAdBeEf 0b_1101_1011 0987 1337",
+            expect![[r#"
+                Token { kind: HexLiteral, offset: 0 }
+                Token { kind: BinaryLiteral, offset: 11 }
+                Token { kind: DecimalLiteral, offset: 24 }
+                Token { kind: DecimalLiteral, offset: 29 }
+            "#]],
+        );
+    }
+
+    #[test]
+    fn strings() {
+        check_lexing(
+            r#"
+            "meowing wow \""
+            "abc \"
+            "#,
+            expect![[r#"
+                Token { kind: Newline, offset: 0 }
+                Token { kind: String, offset: 13 }
+                Token { kind: Newline, offset: 29 }
+                Token { kind: UnterminatedString, offset: 42 }
+            "#]],
+        );
+    }
+
+    #[test]
+    fn percent_separators_and_unknown() {
+        check_lexing(
+            "7 % 0xfg 1_000 $",
+            expect![[r#"
+                Token { kind: DecimalLiteral, offset: 0 }
+                Token { kind: Percent, offset: 2 }
+                Token { kind: HexLiteral, offset: 4 }
+                Token { kind: Identifier, offset: 7 }
+                Token { kind: DecimalLiteral, offset: 9 }
+                Token { kind: Unknown, offset: 15 }
+            "#]],
+        );
+    }
+}
diff --git a/src/lib.rs b/src/lib.rs
index b93cf3f..a5464ca 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -1,14 +1 @@
-pub fn add(left: u64, right: u64) -> u64 {
-    left + right
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-
-    #[test]
-    fn it_works() {
-        let result = add(2, 2);
-        assert_eq!(result, 4);
-    }
-}
+mod lexer;