akhamoth/src/akh_lex.erl
2025-03-12 11:15:08 -04:00

132 lines
3.6 KiB
Erlang

-module(akh_lex).
-moduledoc """
This module contains functions for tokenizing Akhamoth source code.
""".
-export([
new/1,
source_map/1,
next/1
]).
-export_type([lexer/0]).
%% Guard macros.
%%
%% Each macro expands to a bare guard sequence (`,` means AND, `;` separates
%% alternatives), so it is only meaningful in guard position. The expansions
%% are not parenthesised: a macro that contains `;` can safely be followed by
%% `; Other`, but `, Other` would attach to its last alternative only.

%% ASCII decimal digit.
-define(is_digit(C), C >= $0, C =< $9).
%% ASCII letter or underscore: a byte that may begin an identifier.
-define(is_id_start(C), C >= $a, C =< $z; C >= $A, C =< $Z; C =:= $_).
%% Horizontal whitespace only (space or tab); line breaks are matched
%% separately by next/1.
-define(is_space(C), C =:= $\s; C =:= $\t).
%% `T` is a two-byte binary spelling one of the two-character operators.
-define(is_op2(T), T =:= <<"|>">>; T =:= <<"=>">>; T =:= <<"->">>; T =:= <<"==">>).
%% `T` is a one-byte binary spelling one of the single-character operators.
-define(is_op1(T),
T =:= <<"+">>;
T =:= <<"-">>;
T =:= <<"*">>;
T =:= <<"/">>;
T =:= <<"=">>;
T =:= <<".">>
).
-doc """
Tokens for which the category is the same as the content.
""".
-type token_simple() ::
'.'
| '+'
| '-'
| '*'
| '/'
| '='
| '|>'
| '=>'
| '->'
| '=='
| ','
| ':'
| '('
| ')'
| '['
| ']'
| '{'
| '}'.
-doc """
Tokens for which there is content beyond the category.
""".
-type token_complex() :: id | number | unknown.
-doc """
A token in the input stream.
""".
%% Simple tokens carry only their position, since the atom already implies the
%% text; complex tokens also carry the length of the text they cover.
%% Positions produced by this module are the lexer offset at which the token
%% starts.
%% NOTE(review): nothing in this module produces the `inserted` position;
%% presumably it marks tokens synthesised by a later stage. Confirm against the
%% consumers of token().
-type token() ::
{token_simple(), Position :: non_neg_integer() | inserted}
| {token_complex(), Position :: non_neg_integer(), Length :: pos_integer()}.
-record(lexer, {
%% Input that has not been consumed yet.
source :: binary(),
%% How far into the original input the lexer has advanced; starts at 0 and
%% grows by the number of bytes consumed.
offset = 0 :: non_neg_integer(),
%% Created by new/1 and extended by new_line/1 at every line break.
source_map :: akh_source_map:source_map()
}).
-opaque lexer() :: #lexer{}.
%% Result shape shared by next/1 and the local lexing helpers.
-type return() :: none | {ok, token(), lexer()}.
%%% exports
-doc """
Creates a lexer positioned at the start of `Source`, paired with a fresh
source map.
""".
-spec new(binary()) -> lexer().
new(Source) ->
    SourceMap = akh_source_map:new(),
    #lexer{source = Source, source_map = SourceMap}.
-doc """
Extracts the source map held by a lexer.
""".
-spec source_map(lexer()) -> akh_source_map:source_map().
source_map(#lexer{} = Lexer) ->
    Lexer#lexer.source_map.
-doc """
next(Lexer)

Attempts to get the next token in the input.

Spaces and tabs are skipped. Line breaks (`\n` or `\r\n`) are skipped as well,
and each one is recorded in the lexer's source map. Returns `none` once the
input is exhausted, otherwise `{ok, Token, Lexer1}`, where `Lexer1` is the
lexer to use for the following call.

Input that does not start any known token is reported as an `unknown` token
spanning one UTF-8 encoded character (or a single byte when the input is not
valid UTF-8) rather than crashing the caller.
""".
-spec next(lexer()) -> none | {ok, token(), lexer()}.
next(#lexer{source = <<C, Rest/binary>>} = Lx) when ?is_id_start(C) ->
    lex_id(Lx#lexer{source = Rest}, 1);
next(#lexer{source = <<C, Rest/binary>>} = Lx) when ?is_digit(C) ->
    lex_number(Lx#lexer{source = Rest}, 1);
next(#lexer{source = <<C, Rest/binary>>} = Lx) when ?is_space(C) ->
    next(Lx#lexer{source = Rest, offset = Lx#lexer.offset + 1});
%% Two-character operators must be tried before single-character ones so that
%% e.g. `->` is not split into `-` followed by unknown input.
next(#lexer{source = <<T:2/binary, Rest/binary>>, offset = Offset} = Lx) when ?is_op2(T) ->
    {ok, {binary_to_atom(T), Offset}, Lx#lexer{source = Rest, offset = Offset + 2}};
next(#lexer{source = <<T:1/binary, Rest/binary>>, offset = Offset} = Lx) when ?is_op1(T) ->
    {ok, {binary_to_atom(T), Offset}, Lx#lexer{source = Rest, offset = Offset + 1}};
%% Punctuation declared in token_simple(). binary_to_atom/1 is safe here: the
%% guard restricts T to a fixed set, so the atom table cannot grow unbounded.
next(#lexer{source = <<T:1/binary, Rest/binary>>, offset = Offset} = Lx) when
    T =:= <<",">>;
    T =:= <<":">>;
    T =:= <<"(">>;
    T =:= <<")">>;
    T =:= <<"[">>;
    T =:= <<"]">>;
    T =:= <<"{">>;
    T =:= <<"}">>
->
    {ok, {binary_to_atom(T), Offset}, Lx#lexer{source = Rest, offset = Offset + 1}};
next(#lexer{source = <<$\n, Rest/binary>>} = Lx) ->
    new_line(Lx#lexer{source = Rest, offset = Lx#lexer.offset + 1});
next(#lexer{source = <<$\r, $\n, Rest/binary>>} = Lx) ->
    new_line(Lx#lexer{source = Rest, offset = Lx#lexer.offset + 2});
%% Anything else becomes an `unknown` token. A whole UTF-8 character is taken
%% when possible so that a multi-byte character yields a single token.
next(#lexer{source = <<_/utf8, Rest/binary>> = Source, offset = Offset} = Lx) ->
    Len = byte_size(Source) - byte_size(Rest),
    {ok, {unknown, Offset, Len}, Lx#lexer{source = Rest, offset = Offset + Len}};
%% Not valid UTF-8: consume a single byte so the lexer always makes progress.
next(#lexer{source = <<_, Rest/binary>>, offset = Offset} = Lx) ->
    {ok, {unknown, Offset, 1}, Lx#lexer{source = Rest, offset = Offset + 1}};
next(#lexer{source = <<>>}) ->
    none.
%%% local functions
%% Completes an identifier token. `Len` is the number of identifier bytes the
%% caller has already stripped from the front of the source; the lexer offset
%% still points at the first of them, so it is used as the token position.
-spec lex_id(#lexer{}, non_neg_integer()) -> return().
lex_id(#lexer{source = Source, offset = Start} = Lx, Len) ->
    {Total, Remaining} = scan_id(Source, Len),
    {ok, {id, Start, Total}, Lx#lexer{source = Remaining, offset = Start + Total}}.

%% Counts the leading identifier bytes (letters, digits, underscores) of a
%% binary, returning the running total together with the untouched tail.
scan_id(<<C, Rest/binary>>, Count) when ?is_id_start(C); ?is_digit(C) ->
    scan_id(Rest, Count + 1);
scan_id(Rest, Count) ->
    {Count, Rest}.
%% Completes a number token. `Len` is the number of bytes the caller has
%% already stripped from the front of the source; the lexer offset still
%% points at the first of them, so it is used as the token position.
-spec lex_number(#lexer{}, non_neg_integer()) -> return().
lex_number(#lexer{source = Source, offset = Start} = Lx, Len) ->
    {Total, Remaining} = scan_number(Source, Len),
    {ok, {number, Start, Total}, Lx#lexer{source = Remaining, offset = Start + Total}}.

%% Counts the leading digits and `_` separators of a binary, returning the
%% running total together with the untouched tail.
scan_number(<<C, Rest/binary>>, Count) when ?is_digit(C); C =:= $_ ->
    scan_number(Rest, Count + 1);
scan_number(Rest, Count) ->
    {Count, Rest}.
%% Called once a line break has been consumed: stores the current offset (the
%% first position after the break) in the source map, then resumes lexing.
-spec new_line(#lexer{}) -> return().
new_line(#lexer{offset = Offset, source_map = SourceMap} = Lx) ->
    Updated = akh_source_map:insert(Offset, SourceMap),
    next(Lx#lexer{source_map = Updated}).