-module(akh_lex).
-moduledoc """
This module contains functions for tokenizing Akhamoth source code.
""".
-export([
    new/1,
    source_map/1,
    next/1
]).

-export_type([lexer/0]).
%% Character-class guard macros. Each expands to a guard sequence, so they
%% may only be used in guard position (`when ?is_digit(C) ->` etc.).
-define(is_digit(C), C >= $0, C =< $9).
-define(is_id_start(C), C >= $a, C =< $z; C >= $A, C =< $Z; C =:= $_).
-define(is_space(C), C =:= $\s; C =:= $\t).
%% Operator guard macros. ?is_op2 matches the two-character operators; it is
%% checked before ?is_op1 in next/1 so that e.g. `=>` is not lexed as `=`.
-define(is_op2(T), T =:= <<"|>">>; T =:= <<"=>">>; T =:= <<"->">>; T =:= <<"==">>).
-define(is_op1(T),
    T =:= <<"+">>;
    T =:= <<"-">>;
    T =:= <<"*">>;
    T =:= <<"/">>;
    T =:= <<"=">>;
    T =:= <<".">>
).
-doc """
Tokens for which the category is the same as the content.
""".
-type token_simple() ::
    '.'
    | '+'
    | '-'
    | '*'
    | '/'
    | '='
    | '|>'
    | '=>'
    | '->'
    | '=='
    | ','
    | ':'
    | '('
    | ')'
    | '['
    | ']'
    | '{'
    | '}'.
-doc """
Tokens for which there is content beyond the category.
""".
-type token_complex() :: id | number | unknown.
-doc """
A token in the input stream.
""".
-type token() ::
    {token_simple(), Position :: non_neg_integer() | inserted}
    | {token_complex(), Position :: non_neg_integer(), Length :: pos_integer()}.
%% Lexer state: the not-yet-consumed input, the byte offset of the next
%% character relative to the start of the original input, and the source map
%% accumulated from the newlines seen so far.
-record(lexer, {
    source :: binary(),
    offset = 0 :: non_neg_integer(),
    source_map :: akh_source_map:source_map()
}).

-opaque lexer() :: #lexer{}.

-type return() :: none | {ok, token(), lexer()}.
%%% exports

-doc """
Initializes a lexer to tokenize the given binary.
""".
-spec new(binary()) -> lexer().
new(Source) -> #lexer{source = Source, source_map = akh_source_map:new()}.
-doc """
Returns the source map for a lexer.
""".
-spec source_map(lexer()) -> akh_source_map:source_map().
source_map(#lexer{source_map = SourceMap}) -> SourceMap.
-doc """
next(Lexer)

Attempts to get the next token in the input.
""".
-spec next(lexer()) -> return().
next(#lexer{source = <<C, Rest/binary>>} = Lx) when ?is_id_start(C) ->
    lex_id(Lx#lexer{source = Rest}, 1);
next(#lexer{source = <<C, Rest/binary>>} = Lx) when ?is_digit(C) ->
    lex_number(Lx#lexer{source = Rest}, 1);
next(#lexer{source = <<C, Rest/binary>>} = Lx) when ?is_space(C) ->
    next(Lx#lexer{source = Rest, offset = Lx#lexer.offset + 1});
%% Two-character operators must be tried before one-character operators so
%% that e.g. `=>` is not lexed as `=` followed by `>`.
next(#lexer{source = <<T:2/binary, Rest/binary>>, offset = Offset} = Lx) when ?is_op2(T) ->
    {ok, {binary_to_atom(T), Offset}, Lx#lexer{source = Rest, offset = Offset + 2}};
next(#lexer{source = <<T:1/binary, Rest/binary>>, offset = Offset} = Lx) when ?is_op1(T) ->
    {ok, {binary_to_atom(T), Offset}, Lx#lexer{source = Rest, offset = Offset + 1}};
next(#lexer{source = <<$\n, Rest/binary>>} = Lx) ->
    new_line(Lx#lexer{source = Rest, offset = Lx#lexer.offset + 1});
next(#lexer{source = <<$\r, $\n, Rest/binary>>} = Lx) ->
    new_line(Lx#lexer{source = Rest, offset = Lx#lexer.offset + 2});
%% Single-character punctuation declared in token_simple(): previously these
%% had no clause and crashed with function_clause. The atom set is fixed, so
%% binary_to_atom/1 here cannot grow the atom table unboundedly.
next(#lexer{source = <<C, Rest/binary>>, offset = Offset} = Lx) when
    C =:= $,; C =:= $:; C =:= $(; C =:= $); C =:= $[; C =:= $]; C =:= ${; C =:= $}
->
    {ok, {binary_to_atom(<<C>>), Offset}, Lx#lexer{source = Rest, offset = Offset + 1}};
%% Any other character (including a lone \r) is reported as a one-byte
%% `unknown` token rather than crashing, matching token_complex().
next(#lexer{source = <<_, Rest/binary>>, offset = Offset} = Lx) ->
    {ok, {unknown, Offset, 1}, Lx#lexer{source = Rest, offset = Offset + 1}};
next(#lexer{source = <<>>}) ->
    none.
%%% local functions

%% Consumes identifier continuation characters (letters, digits, `_`) after
%% next/1 has already consumed the first character; Len starts at 1.
%% Emits the token once a non-identifier character (or end of input) is seen,
%% advancing the offset past the whole identifier in one step.
-spec lex_id(#lexer{}, non_neg_integer()) -> return().
lex_id(
    #lexer{source = <<C, Rest/binary>>} = Lx,
    Len
) when ?is_id_start(C); ?is_digit(C) ->
    lex_id(Lx#lexer{source = Rest}, Len + 1);
lex_id(#lexer{offset = Offset} = Lx, Len) ->
    {ok, {id, Offset, Len}, Lx#lexer{offset = Offset + Len}}.
%% Consumes number continuation characters (digits and `_` separators) after
%% next/1 has already consumed the leading digit; Len starts at 1. The
%% reported Length counts `_` separators as part of the token.
-spec lex_number(#lexer{}, non_neg_integer()) -> return().
lex_number(
    #lexer{source = <<C, Rest/binary>>} = Lx,
    Len
) when ?is_digit(C); C =:= $_ ->
    lex_number(Lx#lexer{source = Rest}, Len + 1);
lex_number(#lexer{offset = Offset} = Lx, Len) ->
    {ok, {number, Offset, Len}, Lx#lexer{offset = Offset + Len}}.
%% Records a line boundary in the source map and continues lexing. Called by
%% next/1 with the offset already advanced past the newline, so the recorded
%% offset is that of the first character on the new line.
-spec new_line(#lexer{}) -> return().
new_line(#lexer{source_map = SourceMap} = Lx) ->
    next(Lx#lexer{source_map = akh_source_map:insert(Lx#lexer.offset, SourceMap)}).