/*
 * Reconstructed from a git patch whose whitespace and generic type arguments were lost.
 *
 * Commit:  c7116f9bd0471f8638b888472426e383f64cbcdc
 * Author:  Uko Kokņevičs
 * Date:    Sun, 18 Aug 2024 23:29:37 +0800
 * Subject: Some more modularisation
 *
 * Files created by the commit:
 *   lexer/src/main/java/lv/enes/orang/lexer/Lexer.java
 *   lexer/src/main/java/lv/enes/orang/lexer/Token.java
 */

// ===== lexer/src/main/java/lv/enes/orang/lexer/Lexer.java =====
package lv.enes.orang.lexer;

import lv.enes.orang.utils.Codepoint;
import lv.enes.orang.utils.PeekableStream;

import java.io.*;
import java.nio.charset.StandardCharsets;
import java.util.Iterator;
import java.util.function.BiFunction;
import java.util.function.Predicate;
import java.util.stream.IntStream;
import java.util.stream.Stream;

/**
 * Turns a character source into a sequence of {@link Token}s.
 *
 * <p>The input is read line by line and every line gets a {@code '\n'} appended, followed by a
 * single end-of-input sentinel code point ({@code -1}). Whitespace and {@code #}-to-end-of-line
 * comments are skipped between tokens.
 *
 * <p>As an {@link Iterator}, the lexer yields tokens up to and including the first
 * {@link Token.Type#EOF} token; after that {@link #hasNext()} returns {@code false}.
 */
public class Lexer implements Iterator<Token> {
    /** Value of the sentinel code point that the constructor appends after the last line. */
    private static final int EOF = -1;

    /** Returns whether {@code cp} may start an identifier: a letter or an underscore. */
    public static boolean isIdentInitial(Codepoint cp) {
        return Character.isLetter(cp.cp()) || cp.cp() == '_';
    }

    /** Returns whether {@code cp} may continue an identifier: a letter, an underscore or a digit. */
    public static boolean isIdentFinal(Codepoint cp) {
        return isIdentInitial(cp) || Character.isDigit(cp.cp());
    }

    /** Returns whether {@code cp} is the line feed character. */
    public static boolean isNewline(Codepoint cp) {
        return cp.cp() == '\n';
    }

    /** Returns whether {@code cp} is a digit as defined by {@link Character#isDigit(int)}. */
    public static boolean isNumeral(Codepoint cp) {
        return Character.isDigit(cp.cp());
    }

    /** Returns whether {@code cp} is whitespace as defined by {@link Character#isWhitespace(int)}. */
    public static boolean isWhitespace(Codepoint cp) {
        return Character.isWhitespace(cp.cp());
    }

    private final PeekableStream<Codepoint> input;

    /**
     * Creates a lexer over a byte stream, decoding it as UTF-8.
     *
     * @param input the bytes to tokenise
     */
    public Lexer(InputStream input) {
        // Name the charset explicitly so decoding does not depend on JVM/platform settings.
        this(new InputStreamReader(input, StandardCharsets.UTF_8));
    }

    /**
     * Creates a lexer over a character stream.
     *
     * @param input the characters to tokenise
     */
    public Lexer(Reader input) {
        var cpStream = new BufferedReader(input)
                .lines()
                // lines() strips the terminator, so put one '\n' back after every line.
                .flatMapToInt(str -> IntStream.concat(str.codePoints(), IntStream.of('\n')))
                .mapToObj(Codepoint::new);
        var theEof = Stream.of(new Codepoint(EOF));
        this.input = new PeekableStream<>(Stream.concat(cpStream, theEof).iterator());
    }

    /**
     * Creates a lexer over an in-memory string.
     *
     * @param input the source text to tokenise
     */
    public Lexer(String input) {
        this(new StringReader(input));
    }

    private boolean hasNext = true;

    /**
     * Reads the next token. Once the EOF token has been returned, {@link #hasNext()} reports
     * {@code false}.
     *
     * @return the next token of the input
     */
    @Override
    public Token next() {
        var tok = nextToken();
        if (tok.type() == Token.Type.EOF) {
            hasNext = false;
        }
        return tok;
    }

    /** Returns {@code true} until the EOF token has been handed out by {@link #next()}. */
    @Override
    public boolean hasNext() {
        return hasNext;
    }

    /** Skips whitespace and comments, then recognises one token starting at the current position. */
    private Token nextToken() {
        skipWhitespace();
        return switch (input.peek().cp()) {
            // The sentinel is deliberately not consumed.
            case EOF -> new Token(Token.Type.EOF, "");

            case '*' -> new Token(Token.Type.ASTERISK, input.next());
            case '!' -> new Token(Token.Type.BANG, input.next());
            case '[' -> new Token(Token.Type.BRACKET_LEFT, input.next());
            case ']' -> new Token(Token.Type.BRACKET_RIGHT, input.next());
            case ',' -> new Token(Token.Type.COMMA, input.next());
            case '=' -> new Token(Token.Type.EQUAL, input.next());
            case '>' -> readOperator('=', Token.Type.GREATER_EQUAL, Token.Type.GREATER);
            case '<' -> readOperator('=', Token.Type.LESS_EQUAL, Token.Type.LESS);
            case '-' -> readOperator('>', Token.Type.MINUS_GREATER, Token.Type.MINUS);
            case '(' -> new Token(Token.Type.PAREN_LEFT, input.next());
            case ')' -> new Token(Token.Type.PAREN_RIGHT, input.next());
            case '+' -> new Token(Token.Type.PLUS, input.next());
            // A lone '?' is not an operator. Only the '?' itself becomes the ILLEGAL token;
            // the character after it is left for the next token instead of being swallowed.
            case '?' -> readOperator('=', Token.Type.QUESTION_EQUAL, Token.Type.ILLEGAL);
            case ';' -> new Token(Token.Type.SEMICOLON, input.next());
            case '/' -> readOperator('=', Token.Type.SLASH_EQUAL, Token.Type.SLASH);

            case '"' -> readString();

            default -> {
                if (isIdentInitial(input.peek())) {
                    var ident = readIdentifier();
                    yield new Token(keywordOrIdentifier(ident), ident);
                } else if (isNumeral(input.peek())) {
                    yield new Token(Token.Type.INTEGER, readInteger());
                } else {
                    yield new Token(Token.Type.ILLEGAL, input.next());
                }
            }
        };
    }

    /**
     * Reads an operator that is either one character long or, when followed by {@code second},
     * two characters long.
     *
     * @param second the code point that turns the operator into its two-character form
     * @param pair   token type of the two-character form
     * @param single token type when {@code second} does not follow
     */
    private Token readOperator(int second, Token.Type pair, Token.Type single) {
        var first = input.next();
        if (input.peek().cp() == second) {
            return new Token(pair, first, input.next());
        }
        return new Token(single, first);
    }

    /** Maps reserved words to their keyword type; everything else is an identifier. */
    private static Token.Type keywordOrIdentifier(String ident) {
        return switch (ident) {
            case "and" -> Token.Type.AND;
            case "def" -> Token.Type.DEF;
            case "do" -> Token.Type.DO;
            case "else" -> Token.Type.ELSE;
            case "end" -> Token.Type.END;
            case "false" -> Token.Type.FALSE;
            case "fn" -> Token.Type.FN;
            case "if" -> Token.Type.IF;
            case "in" -> Token.Type.IN;
            case "let" -> Token.Type.LET;
            case "then" -> Token.Type.THEN;
            case "true" -> Token.Type.TRUE;
            default -> Token.Type.IDENTIFIER;
        };
    }

    /**
     * Consumes code points while {@code pred} accepts the next one, folding them into an
     * accumulator.
     *
     * @param pred    decides whether the peeked code point is consumed
     * @param initial starting accumulator value
     * @param combine merges a consumed code point into the accumulator
     * @return the final accumulator value
     */
    private <T> T foldWhile(Predicate<Codepoint> pred, T initial, BiFunction<T, Codepoint, T> combine) {
        var res = initial;
        var ch = input.peek();
        while (pred.test(ch)) {
            res = combine.apply(res, input.next());
            ch = input.peek();
        }
        return res;
    }

    /** Consumes code points while {@code pred} holds and returns them concatenated. */
    private String readWhile(Predicate<Codepoint> pred) {
        return foldWhile(pred, new StringBuilder(), StringBuilder::append).toString();
    }

    /** Consumes and discards code points while {@code pred} holds. */
    private void skipWhile(Predicate<Codepoint> pred) {
        while (pred.test(input.peek())) {
            input.next();
        }
    }

    private String readIdentifier() {
        return readWhile(Lexer::isIdentFinal);
    }

    private String readInteger() {
        return readWhile(Lexer::isNumeral);
    }

    /**
     * Reads a double-quoted string literal; the current code point must be the opening quote.
     *
     * @return a STRING token holding the text between the quotes, or an ILLEGAL token holding
     *         the opening quote and everything after it when the input ends before a closing quote
     */
    private Token readString() {
        input.next(); // opening quote
        // Stop at the sentinel as well, otherwise an unterminated literal reads past the input.
        var literal = readWhile(cp -> cp.cp() != '"' && cp.cp() != EOF);
        if (input.peek().cp() == EOF) {
            return new Token(Token.Type.ILLEGAL, "\"" + literal);
        }
        input.next(); // closing quote
        return new Token(Token.Type.STRING, literal);
    }

    /** Skips any mix of whitespace and {@code #} comments, which run to the end of the line. */
    private void skipWhitespace() {
        while (true) {
            skipWhile(Lexer::isWhitespace);
            if (input.peek().cp() != '#') {
                return;
            }
            // The EOF check keeps this loop finite without relying on a trailing newline.
            skipWhile(cp -> !isNewline(cp) && cp.cp() != EOF);
        }
    }
}

// ===== lexer/src/main/java/lv/enes/orang/lexer/Token.java =====
package lv.enes.orang.lexer;

import lv.enes.orang.utils.Codepoint;

/**
 * A lexical token: its category plus the source text it was built from.
 *
 * @param type    the token category
 * @param literal the token text (for STRING tokens, the contents without the surrounding quotes)
 */
public record Token(Type type, String literal) {
    /**
     * Builds a token whose literal is the concatenation of the given code points.
     *
     * @param type the token category
     * @param cps  the code points making up the literal, in order
     */
    public Token(Type type, Codepoint... cps) {
        this(type, codepointsToString(cps));
    }

    private static String codepointsToString(Codepoint... cps) {
        var sb = new StringBuilder(cps.length);
        for (var cp : cps) {
            // Appends cp.toString(); presumably that renders the code point as text. TODO confirm.
            sb.append(cp);
        }
        return sb.toString();
    }

    /** All token categories produced by the lexer. */
    public enum Type {
        ILLEGAL,
        EOF,

        // Literals
        IDENTIFIER,
        INTEGER,
        STRING,

        // Keywords
        AND,
        DEF,
        DO,
        ELSE,
        END,
        FALSE,
        FN,
        IF,
        IN,
        LET,
        THEN,
        TRUE,

        // Special chars
        ASTERISK,
        BANG,
        BRACKET_LEFT,
        BRACKET_RIGHT,
        COMMA,
        EQUAL,
        GREATER,
        GREATER_EQUAL,
        LESS,
        LESS_EQUAL,
        MINUS,
        MINUS_GREATER,
        PAREN_LEFT,
        PAREN_RIGHT,
        PLUS,
        QUESTION_EQUAL,
        SEMICOLON,
        SLASH,
        SLASH_EQUAL,
    }
}