first lexer

This commit is contained in:
wires 2025-06-04 17:36:40 -04:00
parent 6023eb1648
commit 12d58c95f8
Signed by: wires
SSH key fingerprint: SHA256:9GtP+M3O2IivPDlw1UY872UPUuJH2gI0yG6ExBxaaiM
4 changed files with 396 additions and 14 deletions

32
Cargo.lock generated
View file

@@ -2,6 +2,38 @@
# It is not intended for manual editing.
version = 4
[[package]]
name = "dissimilar"
version = "1.0.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8975ffdaa0ef3661bfe02dbdcc06c9f829dfafe6a3c474de366a8d5e44276921"
[[package]]
name = "expect-test"
version = "1.5.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "63af43ff4431e848fb47472a920f14fa71c24de13255a5692e93d4e90302acb0"
dependencies = [
"dissimilar",
"once_cell",
]
[[package]]
name = "once_cell"
version = "1.21.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d"
[[package]]
name = "unicode-xid"
version = "0.2.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ebc1c04c71510c7f702b52b7c350734c9ff1295c464a03335b00bb84fc54f853"
[[package]]
name = "wyrd"
version = "0.1.0"
dependencies = [
"expect-test",
"unicode-xid",
]

View file

@@ -7,3 +7,7 @@ license = "GPL-3.0-only"
repository = "https://git.wires.systems/wires/wyrd"
[dependencies]
unicode-xid = "0.2.6"
[dev-dependencies]
expect-test = "1.5.1"

359
src/lexer.rs Normal file
View file

@@ -0,0 +1,359 @@
use std::iter;
use std::str::Chars;
use unicode_xid::UnicodeXID;
const EOF_CHAR: char = '\0';
/// Returns `true` for characters the lexer skips as insignificant whitespace.
///
/// `'\n'` (U+000A) is deliberately NOT included: line feeds are emitted as
/// `TokenKind::Newline` tokens instead of being skipped.
fn is_whitespace(c: char) -> bool {
    match c {
        // ASCII whitespace (minus '\n', which is its own token):
        // \t, vertical tab, form feed, \r, space
        '\u{0009}' | '\u{000B}' | '\u{000C}' | '\u{000D}' | '\u{0020}' => true,
        // NEXT LINE from latin1
        '\u{0085}' => true,
        // Bidi markers: LEFT-TO-RIGHT MARK, RIGHT-TO-LEFT MARK
        '\u{200E}' | '\u{200F}' => true,
        // Dedicated whitespace characters from Unicode:
        // LINE SEPARATOR, PARAGRAPH SEPARATOR
        '\u{2028}' | '\u{2029}' => true,
        _ => false,
    }
}
/// Returns `true` if `c` may begin an identifier: `_` or any Unicode
/// XID_Start character.
fn is_id_start(c: char) -> bool {
    if c == '_' {
        return true;
    }
    c.is_xid_start()
}
fn is_id_continue(c: char) -> bool {
c.is_xid_continue()
}
/// Returns `true` for characters allowed inside a binary literal body:
/// the digits `0`/`1` plus `_` separators.
fn is_bin_digit(c: char) -> bool {
    c == '_' || c == '0' || c == '1'
}
/// Returns `true` for characters allowed inside a hex literal body:
/// the hex digits `0-9` / `a-f` / `A-F` plus `_` separators.
///
/// Restricted from the earlier `'a'..='z' | 'A'..='Z'` ranges, which
/// accepted every ASCII letter and silently swallowed junk such as
/// `0xBEEG` into a single HexLiteral token; non-hex letters now end the
/// literal instead.
fn is_hex_digit(c: char) -> bool {
    c == '_' || c.is_ascii_hexdigit()
}
/// Returns `true` for characters allowed inside a decimal literal body:
/// the digits `0-9` plus `_` separators.
fn is_dec_digit(c: char) -> bool {
    c == '_' || c.is_ascii_digit()
}
#[repr(u8)]
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
/// The kind of a lexed [`Token`]; position information lives on the token
/// itself.
pub enum TokenKind {
/// A character sequence the lexer does not recognize.
Unknown,
/// End of input; produced on every call once the input is exhausted.
Eof,
/// A single `'\n'` (newlines are significant and not treated as whitespace).
Newline,
/// `_` or an XID_Start character, followed by XID_Continue characters.
Identifier,
/// `0b` followed by binary digits / `_` separators.
BinaryLiteral,
/// `0x` followed by hex digits / `_` separators.
HexLiteral,
/// A decimal integer literal, digits with optional `_` separators.
DecimalLiteral,
/// A `"`-delimited string literal (supports `\`-escapes).
String,
/// A string literal whose closing `"` is missing before end of input.
UnterminatedString,
/// ->
Arrow,
/// |>
Pipeline,
/// =
Equal,
/// ==
DoubleEqual,
/// !=
NotEqual,
/// !
Bang,
/// &
Ampersand,
/// |
Bar,
/// ^
Caret,
/// +
Plus,
/// -
Minus,
/// *
Star,
/// /
Slash,
/// %
Percent,
/// ,
Comma,
/// .
Dot,
/// :
Colon,
/// ;
Semicolon,
/// (
LParen,
/// )
RParen,
/// [
LBracket,
/// ]
RBracket,
/// {
LBrace,
/// }
RBrace,
}
/// A single lexed token: its kind plus where it starts in the source.
///
/// Fields are `pub` so that consumers of the public `tokenize` iterator can
/// actually inspect the tokens they receive (they were previously private,
/// making the returned tokens opaque outside this module).
#[derive(Debug)]
pub struct Token {
    /// What sort of token this is.
    pub kind: TokenKind,
    /// Byte offset of the token's first character within the input string.
    pub offset: u32,
}
/// Internal lexing cursor over the input string.
struct Cursor<'a> {
// Remaining, not-yet-consumed characters of the input.
chars: Chars<'a>,
// Unconsumed byte count as of the start of the current token; the delta
// against `chars.as_str().len()` gives the previous token's byte length.
remaining: usize,
// Byte offset (into the original input) of the token currently being lexed.
offset: u32,
}
impl<'a> Cursor<'a> {
    /// Creates a cursor positioned at the start of `input`.
    fn new(input: &'a str) -> Self {
        Self {
            chars: input.chars(),
            remaining: input.len(),
            offset: 0,
        }
    }

    /// Looks at the next character without consuming it; returns `EOF_CHAR`
    /// (`'\0'`) when the input is exhausted.
    fn peek(&self) -> char {
        self.chars.clone().next().unwrap_or(EOF_CHAR)
    }

    /// Consumes and returns the next character, or `None` at end of input.
    fn bump(&mut self) -> Option<char> {
        self.chars.next()
    }

    /// The not-yet-consumed tail of the input.
    fn as_str(&self) -> &str {
        self.chars.as_str()
    }

    fn is_eof(&self) -> bool {
        self.as_str().is_empty()
    }

    /// Consumes characters while `predicate` holds and input remains.
    /// (At EOF `peek` returns `'\0'`; the `is_eof` check guards against a
    /// predicate that accepts `'\0'`.)
    fn eat_while(&mut self, predicate: impl Fn(char) -> bool) {
        while predicate(self.peek()) && !self.is_eof() {
            self.bump();
        }
    }

    /// Lexes and returns the next token. Whitespace (other than `'\n'`) is
    /// skipped; once input is exhausted every call yields an `Eof` token.
    fn next_token(&mut self) -> Token {
        // Advance `offset` by the byte length of whatever the previous call
        // consumed (see the `remaining` field's invariant).
        self.offset += (self.remaining - self.as_str().len()) as u32;
        self.remaining = self.as_str().len();
        let c = match self.bump() {
            Some(c) => c,
            None => {
                return Token {
                    kind: TokenKind::Eof,
                    offset: self.offset,
                };
            }
        };
        let kind = match c {
            '\n' => TokenKind::Newline,
            c if is_whitespace(c) => {
                // Skip the whole whitespace run, then restart so the
                // offset bookkeeping at the top accounts for it.
                self.eat_while(is_whitespace);
                return self.next_token();
            }
            c if is_id_start(c) => {
                self.eat_while(is_id_continue);
                TokenKind::Identifier
            }
            '0' => self.int_literal_base(),
            '1'..='9' => self.int_literal(),
            '"' => self.string_literal(),
            '=' => {
                if self.peek() == '=' {
                    self.bump();
                    TokenKind::DoubleEqual
                } else {
                    TokenKind::Equal
                }
            }
            '!' => {
                if self.peek() == '=' {
                    self.bump();
                    TokenKind::NotEqual
                } else {
                    TokenKind::Bang
                }
            }
            '&' => TokenKind::Ampersand,
            '|' => {
                if self.peek() == '>' {
                    self.bump();
                    TokenKind::Pipeline
                } else {
                    TokenKind::Bar
                }
            }
            '^' => TokenKind::Caret,
            '+' => TokenKind::Plus,
            '-' => {
                if self.peek() == '>' {
                    self.bump();
                    TokenKind::Arrow
                } else {
                    TokenKind::Minus
                }
            }
            '*' => TokenKind::Star,
            '/' => TokenKind::Slash,
            // Fix: '%' previously had no arm and fell through to the
            // catch-all panic, leaving `Percent` unreachable.
            '%' => TokenKind::Percent,
            ',' => TokenKind::Comma,
            '.' => TokenKind::Dot,
            ':' => TokenKind::Colon,
            ';' => TokenKind::Semicolon,
            '(' => TokenKind::LParen,
            ')' => TokenKind::RParen,
            '[' => TokenKind::LBracket,
            ']' => TokenKind::RBracket,
            '{' => TokenKind::LBrace,
            '}' => TokenKind::RBrace,
            // Fix: unrecognized characters now become `Unknown` tokens
            // (the variant existed but was never produced) instead of
            // panicking via `todo!()`, so callers can report an error.
            _ => TokenKind::Unknown,
        };
        Token {
            kind,
            offset: self.offset,
        }
    }

    /// Lexes the rest of a decimal literal; the first digit (1-9) was
    /// already consumed. Note: only ASCII digits here — `_` separators are
    /// accepted after a leading `0` (via `int_literal_base`) but not after
    /// 1-9; this mirrors the original behavior.
    fn int_literal(&mut self) -> TokenKind {
        self.eat_while(|c| c.is_ascii_digit());
        TokenKind::DecimalLiteral
    }

    /// Lexes a literal that started with `0`: `0b…` binary, `0x…` hex, a
    /// longer decimal like `0987`, or a bare `0`.
    fn int_literal_base(&mut self) -> TokenKind {
        match self.peek() {
            'b' => {
                self.bump();
                self.eat_while(is_bin_digit);
                TokenKind::BinaryLiteral
            }
            'x' => {
                self.bump();
                self.eat_while(is_hex_digit);
                TokenKind::HexLiteral
            }
            '0'..='9' => {
                self.bump();
                self.eat_while(is_dec_digit);
                TokenKind::DecimalLiteral
            }
            _ => TokenKind::DecimalLiteral,
        }
    }

    /// Lexes a string literal body; the opening `"` was already consumed.
    /// A `\` escapes the following character (so `\"` does not terminate).
    /// Returns `UnterminatedString` if input ends before the closing `"`.
    fn string_literal(&mut self) -> TokenKind {
        while let Some(c) = self.bump() {
            if c == '"' {
                return TokenKind::String;
            } else if c == '\\' {
                self.bump();
            }
        }
        TokenKind::UnterminatedString
    }
}
/// Lazily lexes `input`, yielding one [`Token`] at a time and stopping
/// before (i.e. not yielding) the trailing `Eof` token.
pub fn tokenize(input: &str) -> impl Iterator<Item = Token> {
    let mut cursor = Cursor::new(input);
    iter::from_fn(move || {
        let token = cursor.next_token();
        if token.kind == TokenKind::Eof {
            None
        } else {
            Some(token)
        }
    })
}
#[cfg(test)]
mod test {
use expect_test::{Expect, expect};
use super::*;
// Lexes `src` and compares the Debug dump of every produced token
// (one per line) against the expected snapshot.
fn check_lexing(src: &str, expect: Expect) {
let actual: String = tokenize(src).map(|t| format!("{t:?}\n")).collect();
expect.assert_eq(&actual);
}
// End-to-end check over a representative expression: identifiers,
// brackets, punctuation, `|` vs `|>`-free bar, and `->`.
#[test]
fn smoke_test() {
check_lexing(
"fn sum([h | t]: [int], acc: int) -> sum(t, acc + h)",
expect![[r#"
Token { kind: Identifier, offset: 0 }
Token { kind: Identifier, offset: 3 }
Token { kind: LParen, offset: 6 }
Token { kind: LBracket, offset: 7 }
Token { kind: Identifier, offset: 8 }
Token { kind: Bar, offset: 10 }
Token { kind: Identifier, offset: 12 }
Token { kind: RBracket, offset: 13 }
Token { kind: Colon, offset: 14 }
Token { kind: LBracket, offset: 16 }
Token { kind: Identifier, offset: 17 }
Token { kind: RBracket, offset: 20 }
Token { kind: Comma, offset: 21 }
Token { kind: Identifier, offset: 23 }
Token { kind: Colon, offset: 26 }
Token { kind: Identifier, offset: 28 }
Token { kind: RParen, offset: 31 }
Token { kind: Arrow, offset: 33 }
Token { kind: Identifier, offset: 36 }
Token { kind: LParen, offset: 39 }
Token { kind: Identifier, offset: 40 }
Token { kind: Comma, offset: 41 }
Token { kind: Identifier, offset: 43 }
Token { kind: Plus, offset: 47 }
Token { kind: Identifier, offset: 49 }
Token { kind: RParen, offset: 50 }
"#]],
);
}
// Hex / binary (with `_` separators) / zero-leading and plain decimals.
#[test]
fn int_literals() {
check_lexing(
"0xDeAdBeEf 0b_1101_1011 0987 1337",
expect![[r#"
Token { kind: HexLiteral, offset: 0 }
Token { kind: BinaryLiteral, offset: 11 }
Token { kind: DecimalLiteral, offset: 24 }
Token { kind: DecimalLiteral, offset: 29 }
"#]],
);
}
// Escaped quotes inside strings, and the unterminated-string fallback
// when the input ends before a closing quote.
#[test]
fn strings() {
check_lexing(
r#"
"meowing wow \""
"abc \"
"#,
expect![[r#"
Token { kind: Newline, offset: 0 }
Token { kind: String, offset: 13 }
Token { kind: Newline, offset: 29 }
Token { kind: UnterminatedString, offset: 42 }
"#]],
);
}
}

View file

@@ -1,14 +1 @@
/// Returns the sum of `left` and `right`.
pub fn add(left: u64, right: u64) -> u64 {
    right + left
}
#[cfg(test)]
mod tests {
use super::*;
// Smoke test for the crate-template `add` function.
#[test]
fn it_works() {
let result = add(2, 2);
assert_eq!(result, 4);
}
}
mod lexer;