Implement sample with working preprocessor (Stage 1 + 2)

Yannick Reiß 2025-08-11 12:49:44 +02:00
parent 0b6073b5bb
commit 9e4141fc96
9 changed files with 653 additions and 22 deletions

Cargo.toml

@ -4,3 +4,6 @@ version = "0.1.0"
edition = "2024"
[dependencies]
gtoml = "0.1.2"
toml = "0.9.3"
regex = "1.11.1"

example.mlc (new file, 5 lines)

@ -0,0 +1,5 @@
#with language.toml
variable:=-3; c := (a+b- 3) * 23 + variable; d := c - a;Natural : Number (n) := {n >= 0};faculty : Natural (n) -> Natural := if n = 0 then 1 else faculty (n-1) * n end;
String Natural (n) := {Character * n};hello_word -> String := "Hello World!";
first_letter -> Character := 'a';
wrong -> Logic := false;date -> String := "#date_now";

language.toml

@ -1,31 +1,77 @@
# Meta rules are separate rules with priority over all other rules.
# They can be compared to preprocessor directives, but are more powerful.
# Pattern matching in preprocessor style runs at the highest priority, before anything else.
[meta.replacements]
comments = ["^--.*", ""]
# Interpolation with a shell replaces the meta pattern with the interpolation result.
# Passing arguments is supported through groups and #<parameter number> in the shell command.
[meta.interpolation]
with = ["^#with ([\\w./]+)", "cat $1"]
date = ["#date_now", "date"]
# Describes tokens that are replaced by placeholder identifiers and swapped back in after the tokenizer has run.
# All special tokens are treated as constants
[meta.token]
string_constant = "\".*?\""
char_constant = "'.'"
# Every key below is used as a type in an enumeration to sort the tokens
# -> Replacement happens in the given order
# -> Any run of other symbols is saved as a value
# -> Those get the default type "identifier"
[token]
separator = [" "]
operands = [":=", "->", "!", "+", "-", "/", "*", "(", ")", "[", "]", "{", "}", "=", "?", ":", "'", "\""]
separator = [" ", ",", "\n"]
operands = [":=", "->", "<=", ">=", "<", ">", "!", "+", "-", "/", "*", "(", ")", "[", "]", "{", "}", "=", "?", ":"]
terminator = [";"]
[semantics]
keywords = ["if", "then", "else", "with"]
keywords = ["if", "then", "else", "end", "with"]
# constant descriptions
[constants]
number = "(?:0b[01]+|0x[0-9a-fA-F]+|0[0-7]+|[1-9][0-9]*)"
string = "\"[\w\W]*?\""
character = "'[\w\W]'"
character = "'.'"
logic = "(true|false)"
[types]
Number = ""
Character = ""
Number = "number"
Character = "character"
Type = ""
Logic = ""
Array = "{character * number}"
Logic = "logic"
# List of rules
# Rule names can be found in traces, so use better names than rule_1, rule_2, ...
# The compiler will run through all rules trying to match exactly one.
# Uses the following generic types:
# - OPERAND
# - IDENTIFIER
# - KEYWORD
# - TERMINATOR
# - OTHER (Use this type for ambiguous parts. Same as lazy .+ in regular expressions)
# Defining custom types is possible by creating a rule with the same name.
# IMPORTANT: Rules are always top priority and can overwrite other types.
# Named placeholders: The character # is reserved for named placeholders. They are only valid inside a rule.
[syntax]
definition = "IDENTIFIER#1 -> IDENTIFIER#2 := OTHER#3 TERMINATOR"
definition_with_parameter = "IDENTIFIER#1 : parameter#2 -> IDENTIFIER#3 := OTHER#4 TERMINATOR"
recursion = "#basename OTHER := OTHER #basename OTHER TERMINATOR"
replace_predef = [ "IDENTIFIER#1 -> OTHER := OTHER#2 TERMINATOR OTHER IDENTIFIER#1", "#1 -> OTHER := #2 TERMINATOR OTHER (#2)" ]
replace_postdef = [ "IDENTIFIER#1 OTHER TERMINATOR IDENTIFIER#1 -> OTHER := OTHER#2 TERMINATOR", "#2 OTHER TERMINATOR #1 -> OTHER := #2 TERMINATOR" ]
unfold_parameter = [ ": OTHER IDENTIFIER#1 ( IDENTIFIER#2 OTHER#3 ) OTHER ->", ": OTHER #1 #2 #1 ( #3 ) OTHER ->" ]
unfold_parameter_remove_brackets = [ ": OTHER IDENTIFIER ( ) OTHER ->", ": OTHER OTHER ->" ]
parameter = ": OTHER ->"
[hdl]
[compiled]
# The following sections are used to build different output formats
# [interpreter] refers to the builtin interpreter using a minimal subset of C syntax
# The name of each section is only used to specify the actual output.
[clang]
definition = "#2 #1 () {return (#3);}"
Logic = "int"
Number = "long int"
Character = "char"
Type = "struct"
[interpreter]
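
The [meta.interpolation] rules above are what expand the "#with language.toml" and "#date_now" directives in example.mlc: the pattern's capture groups are substituted into the shell command, the command runs, and its output replaces the match. A minimal standalone sketch of that mechanic (regex crate only, not the project's code; the "with" rule is taken from this file):

use regex::Regex;

fn main() {
    // Pattern and command template of the "with" rule from [meta.interpolation].
    let pattern = Regex::new(r"^#with ([\w./]+)").unwrap();
    let source = "#with language.toml\nvariable:=-3;";

    if let Some(caps) = pattern.captures(source) {
        // Expand $1 in the command template with the captured file name.
        let mut command = String::new();
        caps.expand("cat $1", &mut command); // command == "cat language.toml"
        // src/preprocessor.rs would now run this through /bin/bash and splice
        // the stdout back into the source in place of the directive.
        println!("{command}");
    }
}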

src/identification.rs (new file, 75 lines)

@ -0,0 +1,75 @@
use crate::Token;
use regex::Regex;
use toml::{Table, Value};
// Identifier
// Each identifier token is analyzed so later stages can work with it.
pub struct Identifier {
identities: Vec<Identity>,
pub tokens: Vec<Token>,
type_configuration: Table,
constant_configuration: Table,
}
// Identity
// The identity of an identifier token.
struct Identity {
class: IdentityClass,
name: String,
sub_type: String,
}
pub enum IdentityClass {
TYPE,
CONSTANT,
DEFINITION,
}
impl Identifier {
pub fn new(token: Vec<Token>) -> Identifier {
let identities: Vec<Identity> = vec![];
let new_config_type: Table = Table::new();
let new_config_constant: Table = Table::new();
Identifier {
identities: identities,
tokens: token,
type_configuration: new_config_type,
constant_configuration: new_config_constant,
}
}
pub fn load_criteria_from_configuration(&mut self, complete_configuration: Table) {
let type_configuration_wrapped: &Value = complete_configuration
.get("types")
.expect("Missing section types in configuration.");
let constant_configuration_wrapped: &Value = complete_configuration
.get("constants")
.expect("Missing section constants in configuration.");
let type_configuration: Table = Table::try_from(type_configuration_wrapped)
.expect("Can't read type configuration from Value.");
let constant_configuration: Table = Table::try_from(constant_configuration_wrapped)
.expect("Can't read constant configuration from Value.");
self.type_configuration = type_configuration;
self.constant_configuration = constant_configuration;
}
pub fn identify_identifiers(&mut self) {
let tokens: &Vec<Token> = &self.tokens;
let constant_patterns: Table = self.constant_configuration.clone();
let type_names: Table = self.type_configuration.clone();
let mut identity_found = false;
for token in tokens.iter() {
if token.token_type == crate::TokenType::IDENTIFIER {
for raw_pattern in constant_patterns.iter() {
let pattern: &str = raw_pattern.1.as_str().unwrap();
let expression: Regex = Regex::new(pattern).unwrap();
// Check for constant
if expression.is_match(token.token.as_str()) {
println!("Matching! Found {:?} {:?}.", raw_pattern.0, token.token);
}
}
}
}
}
}
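
Note that identify_identifiers only prints a line per match for now; the identities vector is never filled. The check itself is a plain unanchored regex match of each IDENTIFIER token against every [constants] pattern. A standalone illustration with the number pattern from language.toml (a sketch, not project code):

use regex::Regex;

fn main() {
    // number pattern from [constants] in language.toml
    let number = Regex::new("(?:0b[01]+|0x[0-9a-fA-F]+|0[0-7]+|[1-9][0-9]*)").unwrap();
    // Tokens the tokenizer leaves as IDENTIFIER; the '-' of "-3" in example.mlc
    // should already have been split off as an operand by this point.
    for tok in ["3", "23", "variable"] {
        println!("{tok}: number constant = {}", number.is_match(tok));
    }
}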

src/main.rs

@ -1,16 +1,26 @@
mod collector;
mod identification;
mod preprocessor;
mod testcases;
mod tokenizer;
use collector::Collector;
use tokenizer::*;
fn main() {
let mut _collector: Collector = Collector {
definitions: vec![(String::from(""), String::from(""))],
arguments: vec![(String::from(""), String::from(""))],
};
let sample_code: String = std::fs::read_to_string("example.mlc").unwrap();
let mut example_tokenizer: Tokenizer = Tokenizer::new();
let mut meta_rules: crate::preprocessor::MetaRules =
crate::preprocessor::MetaRules::new("./language.toml");
let processed_sample_code: String = meta_rules.process(sample_code.to_owned());
example_tokenizer.read_configuration_from_file("./language.toml");
example_tokenizer.eat(processed_sample_code.as_str());
example_tokenizer.identify_tokens();
let test_string: String = String::from("(1 + 2) * 3");
let mut example_identifier: identification::Identifier =
identification::Identifier::new(example_tokenizer.tokens);
example_identifier.load_criteria_from_configuration(example_tokenizer.configuration);
example_identifier.identify_identifiers();
let echo_string: String = _collector.eval(test_string);
println!("Result: {}", echo_string);
for token in example_identifier.tokens.iter() {
print!("{}", token.token);
}
}

src/preprocessor.rs (new file, 148 lines)

@ -0,0 +1,148 @@
use crate::tokenizer::Token;
use regex::{Captures, Regex};
use toml::{Table, Value};
// MetaRules
// Struct containing all meta rules.
pub struct MetaRules {
replacement_rules: Vec<(String, (String, String))>,
interpolation_rules: Vec<(String, (String, String))>,
token_rules: Vec<(String, String)>,
special_tokens: Vec<Token>,
}
// Implementation of MetaRules
// Functions associated with the MetaRules struct.
impl MetaRules {
// @name new
// @return MetaRules
// @brief Create a new rule struct by reading from a configuration file.
// @param configuration_filename: &str
pub fn new(configuration_filename: &str) -> MetaRules {
let configuration_content: String = std::fs::read_to_string(configuration_filename)
.expect("[ERROR] Could not open configuration file!");
let mut replacements: Vec<(String, (String, String))> = vec![];
let mut interpolation: Vec<(String, (String, String))> = vec![];
let mut meta_token_rules: Vec<(String, String)> = vec![];
let meta_tokens: Vec<Token> = vec![];
let configuration = gtoml::parse(configuration_content.as_str())
.expect("[ERROR] TOML invalid in preprocessor!");
let configuration_unpacked: Table = Table::try_from(configuration).unwrap();
let meta_configuration: Table = match configuration_unpacked.get("meta") {
Some(config) => config.as_table().unwrap().clone(),
None => Table::new(),
};
if !meta_configuration.is_empty() {
if meta_configuration.contains_key("replacements") {
println!("[INFO] Found replacement rules.");
let replacement_rules: Table = meta_configuration
.get("replacements")
.unwrap()
.as_table()
.unwrap()
.clone();
for key in replacement_rules.keys() {
let value: Vec<Value> = replacement_rules
.get(key)
.unwrap()
.as_array()
.unwrap()
.clone();
let name: String = key.clone();
let pattern: String = value[0].as_str().unwrap().to_owned();
let replacement: String = value[1].as_str().unwrap().to_owned();
replacements.push((name, (pattern, replacement)));
}
}
if meta_configuration.contains_key("interpolation") {
println!("[INFO] Found interpolation rules.");
let interpolation_rules: Table = meta_configuration
.get("interpolation")
.unwrap()
.as_table()
.unwrap()
.clone();
for key in interpolation_rules.keys() {
let value: Vec<Value> = interpolation_rules
.get(key)
.unwrap()
.as_array()
.unwrap()
.clone();
let name: String = key.clone();
let pattern: String = value[0].as_str().unwrap().to_owned();
let cmd: &str = value[1].as_str().unwrap();
interpolation.push((name, (pattern, String::from(cmd))));
}
}
if meta_configuration.contains_key("token") {
println!("[INFO] Found token rules.");
let token_rules: Table = meta_configuration
.get("token")
.unwrap()
.as_table()
.unwrap()
.clone();
for rule in token_rules.keys() {
let pattern: String =
token_rules.get(rule).unwrap().as_str().unwrap().to_owned();
meta_token_rules.push((rule.clone(), pattern));
}
}
} else {
println!("[WARNING] No meta configuration, skipping preprocessor.");
}
MetaRules {
replacement_rules: replacements,
interpolation_rules: interpolation,
token_rules: meta_token_rules,
special_tokens: meta_tokens,
}
}
// @name process
// @return String
// @brief Run preprocessor on raw code.
// @param &mut self, raw_code: String
pub fn process(&mut self, raw_code: String) -> String {
let mut processed_code: String = raw_code.clone();
// replacement rules
for rule in self.replacement_rules.iter() {
println!("[INFO] Applying rule {}", rule.0);
let base_pattern: Regex = Regex::new((rule.1 .0).as_str()).unwrap();
processed_code = base_pattern
.replace_all(processed_code.as_str(), rule.1 .1.as_str())
.to_string();
}
// interpolation rules
for rule in self.interpolation_rules.iter() {
println!("[INFO] Applying rule {}", rule.0);
let base_pattern: Regex = Regex::new((rule.1 .0).as_str()).unwrap();
let processed_code_replacement = processed_code.clone();
let parameter = &base_pattern
.captures(processed_code_replacement.as_str())
.unwrap()[0];
let command: &str = &base_pattern.replace(parameter, rule.1 .1.as_str());
println!("{:?}", &command);
let subprocess = std::process::Command::new("/bin/bash")
.arg("-c")
.arg(String::from("echo \"$(") + command + ")\"")
.output()
.expect((String::from("") + "Failed to run command " + command + "!").as_str());
processed_code = base_pattern
.replace(
processed_code.as_str(),
String::from_utf8(subprocess.stdout).unwrap(),
)
.to_string();
}
return processed_code;
}
}
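
Two sharp edges in process() worth noting: captures(...).unwrap() panics if an interpolation pattern does not occur in the source, and the command output is handed to replace as an expansion template, so the trailing newline from echo is kept and any "$1"-style text in the output would be treated as a capture reference. A hedged sketch of how a single interpolation step could be hardened (same crates; this is not the committed implementation, and interpolate_once is a hypothetical helper):

use regex::{NoExpand, Regex};
use std::process::Command;

// Apply one interpolation rule; return the source unchanged when the pattern
// does not occur, trim the trailing newline of the command output, and keep
// any '$' in the output literal via NoExpand.
fn interpolate_once(source: &str, pattern: &str, cmd_template: &str) -> String {
    let re = Regex::new(pattern).unwrap();
    let Some(hit) = re.find(source) else {
        return source.to_string();
    };
    // Build the concrete command by expanding $1, $2, ... against the match.
    let command = re.replace(hit.as_str(), cmd_template).to_string();
    let output = Command::new("/bin/bash")
        .arg("-c")
        .arg(&command)
        .output()
        .expect("failed to run interpolation command");
    let result = String::from_utf8_lossy(&output.stdout);
    re.replace(source, NoExpand(result.trim_end())).to_string()
}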

src/structure.rs (new file, 31 lines)

@ -0,0 +1,31 @@
// HeadStructure
// Top level of the configuration structure.
pub struct HeadStructure {
token: TokenConfiguration,
syntax: SyntaxConfiguration,
semantics: SemanticsConfiguration,
types: TypesConfiguration,
hdl: HdlConfiguration,
compiled: CompiledConfiguration,
interpreter: InterpreterConfiguration,
}
pub struct TokenConfiguration {
separator: Vec<String>,
operands: Vec<String>,
terminator: Vec<String>,
}
pub struct SyntaxConfiguration {
keywords: Vec<String>,
}
pub struct SemanticsConfiguration {}
pub struct TypesConfiguration {}
pub struct HdlConfiguration {}
pub struct CompiledConfiguration {}
pub struct InterpreterConfiguration {}

src/testcases.rs (new file, 9 lines)

@ -0,0 +1,9 @@
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn dummy_test() {
assert_eq!(2, 2);
}
}
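
dummy_test only asserts that 2 == 2. A sketch of a test that actually exercises the new tokenizer could sit in the same tests module (test name is hypothetical; it assumes cargo test runs with language.toml in the working directory, as main() also assumes):

#[test]
fn eat_splits_statement_into_tokens() {
    let mut tokenizer = crate::tokenizer::Tokenizer::new();
    tokenizer.read_configuration_from_file("./language.toml");
    tokenizer.eat("d := c - a;");
    tokenizer.identify_tokens();
    let tokens: Vec<&str> = tokenizer.tokens.iter().map(|t| t.token.as_str()).collect();
    // Separators are dropped, operands and the terminator become their own tokens.
    assert_eq!(tokens, vec!["d", ":=", "c", "-", "a", ";"]);
}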

src/tokenizer.rs (new file, 304 lines)

@ -0,0 +1,304 @@
use std::fs;
use toml::{Table, Value};
#[derive(PartialEq)]
pub enum TokenType {
OPERAND,
TERMINATOR,
IDENTIFIER,
KEYWORD,
}
// Tokenizer
// Tokenizer and underlying functions to turn code into tokens
pub struct Tokenizer {
pub token_list: Vec<String>,
// BUG:
pub tokens: Vec<Token>,
// Grammar options from toml file
pub configuration: Table,
}
// Token
// This is a token with a token type.
pub struct Token {
pub token: String,
pub token_type: TokenType,
}
// Implementation of Tokenizer
// Functions associated with the tokenizer struct and module.
impl Tokenizer {
// @name read_configuration_from_file
// @return
// @brief Try to read configuration from an external file
// @param &mut self, configuration_filename: &str
pub fn read_configuration_from_file(&mut self, configuration_filename: &str) {
let configuration_string: String = fs::read_to_string(configuration_filename).expect(
(String::from("Could not open configuration file at: ") + configuration_filename)
.as_str(),
);
let configuration = gtoml::parse(configuration_string.as_str()).expect("TOML invalid!");
self.configuration = Table::try_from(configuration).unwrap();
// Check for token section in config, panic if not present
if !self.configuration.contains_key("token") {
panic!("Token section is not present!");
}
// Check for semantics section in config, panic if not present
if !self.configuration.contains_key("semantics") {
panic!("Section semantics is not present!");
}
}
// @name new
// @return Tokenizer
// @brief Create a new Tokenizer
// @param
pub fn new() -> Tokenizer {
let empty_tokens: Vec<Token> = vec![];
let empty_value: toml::map::Map<String, Value> = toml::map::Map::new();
let empty_token_list: Vec<String> = vec![];
Tokenizer {
tokens: empty_tokens,
token_list: empty_token_list,
configuration: empty_value,
}
}
// @name eat
// @return
// @brief Consumes a string and saves the tokens
// @param &mut self, line: &str
pub fn eat(&mut self, line: &str) {
// Get token vectors from configuration
let token_table_value: &Value = self.configuration.get("token").unwrap();
let token_table: Table = Table::try_from(token_table_value).unwrap();
let mut tokens: Vec<String> = vec![line.to_string()];
let mut new_tokens: Vec<String> = vec![];
let mut token_buffer: String = String::from("");
// Iterate over tokens in token table and split tokens.
if token_table.contains_key("separator") {
let separator: Vec<Value> = token_table
.get_key_value("separator")
.unwrap()
.1
.as_array()
.unwrap()
.clone();
if separator.len() > 0 {
for token in tokens.iter() {
let mut token_feed = token.clone();
while !token_feed.is_empty() {
let mut no_match: bool = true;
for sep in separator.iter() {
if token_feed.starts_with(sep.as_str().unwrap()) {
// Reset and add token
no_match = false;
if token_buffer.len() > 0 {
new_tokens.push(token_buffer.clone());
token_buffer = String::from("");
}
let new_feed: String =
token_feed.split_off(sep.as_str().unwrap().len());
token_feed = new_feed;
}
}
if no_match {
let new_feed: String = token_feed.split_off(1);
token_buffer = token_buffer
+ String::from(token_feed.chars().next().unwrap()).as_str();
token_feed = new_feed;
}
}
// empty token
new_tokens.push(token_buffer.clone());
token_buffer = String::from("");
}
// empty token
new_tokens.push(token_buffer.clone());
token_buffer = String::from("");
}
}
tokens = new_tokens.clone();
new_tokens = vec![];
if token_table.contains_key("operands") {
let operands: Vec<Value> = token_table
.get_key_value("operands")
.unwrap()
.1
.as_array()
.unwrap()
.clone();
if operands.len() > 0 {
for token in tokens.iter() {
let mut token_feed = token.clone();
while !token_feed.is_empty() {
let mut no_match: bool = true;
for op in operands.iter() {
if token_feed.starts_with(op.as_str().unwrap()) {
// Reset and add token
no_match = false;
if token_buffer.len() > 0 {
new_tokens.push(token_buffer.clone());
}
token_buffer = String::from("");
new_tokens.push(op.as_str().unwrap().to_string());
let new_feed: String =
token_feed.split_off(op.as_str().unwrap().len());
token_feed = new_feed;
}
}
if no_match {
let new_feed: String = token_feed.split_off(1);
token_buffer = token_buffer
+ String::from(token_feed.chars().next().unwrap()).as_str();
token_feed = new_feed;
}
}
// empty token
new_tokens.push(token_buffer.clone());
token_buffer = String::from("");
}
// empty token
new_tokens.push(token_buffer.clone());
token_buffer = String::from("");
}
}
tokens = new_tokens.clone();
new_tokens = vec![];
if token_table.contains_key("terminator") {
let terminator: Vec<Value> = token_table
.get_key_value("terminator")
.unwrap()
.1
.as_array()
.unwrap()
.clone();
if terminator.len() > 0 {
for token in tokens.iter() {
let mut token_feed = token.clone();
while !token_feed.is_empty() {
let mut no_match: bool = true;
for term in terminator.iter() {
if token_feed.starts_with(term.as_str().unwrap()) {
// Reset and add token
no_match = false;
if token_buffer.len() > 0 {
new_tokens.push(token_buffer.clone());
}
token_buffer = String::from("");
new_tokens.push(term.as_str().unwrap().to_string());
let new_feed: String =
token_feed.split_off(term.as_str().unwrap().len());
token_feed = new_feed;
}
}
if no_match {
let new_feed: String = token_feed.split_off(1);
token_buffer = token_buffer
+ String::from(token_feed.chars().next().unwrap()).as_str();
token_feed = new_feed;
}
}
// empty token as token ended
new_tokens.push(token_buffer.clone());
token_buffer = String::from("");
}
// empty token
new_tokens.push(token_buffer.clone());
}
}
self.token_list.append(&mut new_tokens);
}
// @name identify_tokens
// @return
// @brief Go through all tokens and assign each a token type.
// @param &mut self
pub fn identify_tokens(&mut self) {
// Go through token list
let mut token_identities: Vec<Token> = vec![];
let mut found_token: bool;
let token_section: Table =
Table::try_from(self.configuration.get("token").unwrap()).unwrap();
let semantics_section: Table =
Table::try_from(self.configuration.get("semantics").unwrap()).unwrap();
for token in self.token_list.iter() {
found_token = false;
if token.as_str() == "" {
continue;
}
// Check if token is an operand
if token_section.contains_key("operands") {
let operands: Vec<Value> = token_section
.get_key_value("operands")
.unwrap()
.1
.as_array()
.unwrap()
.clone();
for operand in operands.iter() {
if operand.as_str().unwrap() == token.as_str() {
token_identities.push(Token {
token: token.clone(),
token_type: TokenType::OPERAND,
});
found_token = true;
}
}
}
if token_section.contains_key("terminator") && !found_token {
let terminator: Vec<Value> = token_section
.get_key_value("terminator")
.unwrap()
.1
.as_array()
.unwrap()
.clone();
for term in terminator.iter() {
if term.as_str().unwrap() == token.as_str() {
token_identities.push(Token {
token: token.clone(),
token_type: TokenType::TERMINATOR,
});
found_token = true;
}
}
}
if semantics_section.contains_key("keywords") && !found_token {
let keywords: Vec<Value> = semantics_section
.get_key_value("keywords")
.unwrap()
.1
.as_array()
.unwrap()
.clone();
for keyword in keywords.iter() {
if keyword.as_str().unwrap() == token.as_str() {
token_identities.push(Token {
token: token.clone(),
token_type: TokenType::KEYWORD,
});
found_token = true;
}
}
}
if !found_token {
token_identities.push(Token {
token: token.clone(),
token_type: TokenType::IDENTIFIER,
});
}
}
self.tokens = token_identities;
}
}
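
eat() makes three passes over the input: first it splits on separators (which are dropped), then on operands, then on terminators (both of which become tokens of their own); identify_tokens() later skips the empty strings the passes leave behind. Within each pass the candidates are tried in the order they appear in the toml arrays, which is why ":=", "<=" and ">=" are listed before ":", "<" and ">" in language.toml; otherwise ":=" would be split into ":" and "=". For the statement "c := (a+b- 3) * 23 + variable;" from example.mlc the passes should yield roughly:

after separators:  c | := | (a+b- | 3) | * | 23 | + | variable;
after operands:    c | := | ( | a | + | b | - | 3 | ) | * | 23 | + | variable;
after terminators: c | := | ( | a | + | b | - | 3 | ) | * | 23 | + | variable | ;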