Compare commits

...

5 Commits

Author SHA1 Message Date
2a846a5f53 Load configuration for in language syntax replacements 2025-08-26 22:43:59 +02:00
f67c79c65b Add testbench 2025-08-25 12:09:54 +02:00
42fa5affb5 Reintroducing meta tokens 2025-08-25 07:12:22 +02:00
015de5dc0a implement meta token replacement 2025-08-24 20:54:20 +02:00
ddba3423df Transformation 2025-08-12 19:04:09 +02:00
9 changed files with 408 additions and 28 deletions

View File

@@ -1,5 +1,3 @@
#with language.toml
variable:=-3; c := (a+b- 3) * 23 + variable; d := c - a;Natural : Number (n) := {n >= 0};faculty : Natural (n) -> Natural := if n = 0 then 1 else faculty (n-1) * n end;
String Natural (n) := {Character * n};hello_word -> String := "Hello World!";
first_letter -> Character := 'a';
wrong -> Logic := false;date -> String := "#date_now";

View File

@@ -10,6 +10,8 @@ comments = ["^--.*", ""]
[meta.interpolation]
with = ["^#with ([\\w./]+)", "cat $1"]
date = ["#date_now", "date"]
user = ["#user", "user"]
test = ["#test", "cat ./mathlib.mlc"]
# Describes tokens to be replaced by identifiers and then later swapped back in after the tokenizer.
# All special tokens are treated as constants
@@ -27,7 +29,7 @@ operands = [":=", "->", "<=", ">=", "<", ">", "!", "+", "-", "/", "*", "(", ")",
terminator = [";"]
[semantics]
keywords = ["if", "then", "else", "end", "with"]
keywords = ["if", "then", "else", "end"]
[constants]
number = "(?:0b[01]+|0x[0-9a-fA-F]+|0[0-7]+|[1-9][0-9]*)"

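The interpolation rules above pair a regex (whose capture groups are referenced as $1, $2, ... in the second element) with a shell command whose output replaces the matched directive. A minimal sketch of that expansion for a single rule, using the same regex and std::process APIs the preprocessor relies on; the expand_interpolation helper and its simplified invocation (no echo wrapper) are illustrative only, not part of this changeset:

use regex::Regex;
use std::process::Command;

// Hypothetical helper: expand the first directive matching one interpolation
// rule of the form [pattern, shell_command], e.g. ["^#with ([\\w./]+)", "cat $1"].
fn expand_interpolation(source: &str, pattern: &str, shell_command: &str) -> String {
    let rule = Regex::new(pattern).expect("invalid interpolation pattern");
    // Take the first directive that matches, if any.
    let Some(directive) = rule.find(source) else {
        return source.to_string();
    };
    // Substitute capture groups ($1, $2, ...) into the shell command.
    let command = rule.replace(directive.as_str(), shell_command).to_string();
    // Run the command and splice its stdout in place of the directive.
    let output = Command::new("/bin/bash")
        .arg("-c")
        .arg(&command)
        .output()
        .expect("interpolation command failed");
    let stdout = String::from_utf8_lossy(&output.stdout);
    source.replacen(directive.as_str(), stdout.trim_end(), 1)
}

With the date rule, a "#date_now" directive would come back replaced by the output of date, which is what the example.mlc line date -> String := "#date_now"; relies on.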
4
mathlib.mlc Normal file
View File

@@ -0,0 +1,4 @@
Sigma -> Array := {0, 1, 2};
N -> Array := {3};
P -> Array := {3 -> 012};
S -> Number := 3;

View File

@@ -1,26 +1,52 @@
mod identification;
mod preprocessor;
mod syntax;
mod testcases;
mod tokenizer;
use tokenizer::*;
fn main() {
let sample_code: String = std::fs::read_to_string("example.mlc").unwrap();
let mut example_tokenizer: Tokenizer = Tokenizer::new();
// CL-Wrapper
let args: Vec<String> = std::env::args().collect();
// Adjust to the following principle:
// micro [-t <target>] [-l <language.toml>] [<list of source files>]
// -t default: first found
// -l default: language.toml
//
// Either loads all source files or takes stdin input by piping code into the program
let mut raw_source_code: String = String::from("");
for i in 1..args.len() {
raw_source_code = raw_source_code
+ std::fs::read_to_string(args[i].clone())
.expect("Source file not found!")
.as_str();
}
// Load language toml
let mut meta_rules: crate::preprocessor::MetaRules =
crate::preprocessor::MetaRules::new("./language.toml");
let processed_sample_code: String = meta_rules.process(sample_code.to_owned());
example_tokenizer.read_configuration_from_file("./language.toml");
example_tokenizer.eat(processed_sample_code.as_str());
example_tokenizer.identify_tokens();
let mut tokenizer_configuration: Tokenizer = Tokenizer::new();
tokenizer_configuration.read_configuration_from_file("./language.toml");
let mut example_identifier: identification::Identifier =
identification::Identifier::new(example_tokenizer.tokens);
example_identifier.load_criteria_from_configuration(example_tokenizer.configuration);
example_identifier.identify_identifiers();
// Run preprocessor
let preprocessed_source_code: String = meta_rules.process(raw_source_code);
for token in example_identifier.tokens.iter() {
print!("{}", token.token);
// Tokenizing
tokenizer_configuration.eat(preprocessed_source_code.as_str());
tokenizer_configuration.identify_tokens();
// Reintroducing meta_tokens
for meta_token in meta_rules.special_tokens.iter() {
// Go through all tokens
for i in 0..tokenizer_configuration.tokens.len() {
if meta_token.0 == tokenizer_configuration.tokens[i].token {
tokenizer_configuration.tokens[i] = meta_token.1.clone();
break;
}
}
}
// Syntax resolving
// Apply translation
}

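The comment block in main above sketches the planned invocation, micro [-t <target>] [-l <language.toml>] [<list of source files>], with the stated defaults. A minimal sketch of how that could be parsed with only the standard library; the parse_arguments helper and its return shape are assumptions for illustration, not code from this changeset:

// Hypothetical argument handling following the comment above:
// micro [-t <target>] [-l <language.toml>] [<list of source files>]
fn parse_arguments(args: &[String]) -> (Option<String>, String, Vec<String>) {
    let mut target: Option<String> = None; // -t, default: first found
    let mut language = String::from("language.toml"); // -l, default
    let mut sources: Vec<String> = vec![];
    let mut i = 1;
    while i < args.len() {
        match args[i].as_str() {
            "-t" => {
                target = args.get(i + 1).cloned();
                i += 2;
            }
            "-l" => {
                if let Some(path) = args.get(i + 1) {
                    language = path.clone();
                }
                i += 2;
            }
            source_file => {
                sources.push(source_file.to_string());
                i += 1;
            }
        }
    }
    (target, language, sources)
}

When sources stays empty, the piping case mentioned in the comment could be covered by reading standard input instead, for example with std::io::read_to_string(std::io::stdin()).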
View File

@@ -1,5 +1,5 @@
use crate::tokenizer::Token;
use regex::{Captures, Regex};
use regex::{Captures, Match, Regex};
use toml::{Table, Value};
// MetaRules
@@ -8,7 +8,7 @@ pub struct MetaRules {
replacement_rules: Vec<(String, (String, String))>,
interpolation_rules: Vec<(String, (String, String))>,
token_rules: Vec<(String, String)>,
special_tokens: Vec<Token>,
pub special_tokens: Vec<(String, Token)>,
}
// Implementation of MetaRules
@@ -24,7 +24,7 @@ impl MetaRules {
let mut replacements: Vec<(String, (String, String))> = vec![];
let mut interpolation: Vec<(String, (String, String))> = vec![];
let mut meta_token_rules: Vec<(String, String)> = vec![];
let meta_tokens: Vec<Token> = vec![];
let meta_tokens: Vec<(String, Token)> = vec![];
let configuration = gtoml::parse(configuration_content.as_str())
.expect("[ERROR] TOML invalid in preprocessor!");
let configuration_unpacked: Table = Table::try_from(configuration).unwrap();
@@ -125,11 +125,15 @@ impl MetaRules {
println!("[INFO] Applying rule {}", rule.0);
let base_pattern: Regex = Regex::new((rule.1 .0).as_str()).unwrap();
let processed_code_replacement = processed_code.clone();
let parameter = &base_pattern
.captures(processed_code_replacement.as_str())
.unwrap()[0];
let command: &str = &base_pattern.replace(parameter, rule.1 .1.as_str());
println!("{:?}", &command);
let captures: Option<Captures> =
base_pattern.captures(processed_code_replacement.as_str());
let directive: String;
match captures {
Some(n) => directive = n.get(0).map_or("", |m| m.as_str()).to_string(),
None => continue,
};
let command: &str = &base_pattern.replace(directive.as_str(), rule.1 .1.as_str());
let subprocess = std::process::Command::new("/bin/bash")
.arg("-c")
.arg(String::from("echo \"$(") + command + ")\"")
@@ -143,6 +147,42 @@ impl MetaRules {
.to_string();
}
for token_style in self.token_rules.iter() {
println!("[INFO] Searching meta tokens of style {}", token_style.0);
// Find the first occurrence of this token style
let token_pattern: Regex =
Regex::new(token_style.1.as_str()).expect("Could not assign pattern.");
let match_list: Match;
match_list = match token_pattern.find(processed_code.as_str()) {
Some(n) => n,
None => continue,
};
// Create id for each occurrence
let meta_id: String = String::from("meta_token_")
+ match_list.start().to_string().as_str()
+ "__"
+ match_list.end().to_string().as_str();
// Replace token by id
let meta_value: String = match_list.as_str().to_string();
let value_regex: Regex =
Regex::new(meta_value.as_str()).expect("Could not create pattern.");
processed_code = value_regex
.replace(processed_code.as_str(), meta_id.as_str())
.to_string();
// Save id and token
self.special_tokens.push((
meta_id,
Token {
token: meta_value,
token_type: crate::TokenType::IDENTIFIER,
},
));
}
return processed_code;
}
}

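In the loop above, Regex::find only returns the first occurrence of each token style, and the matched text is itself recompiled as a regex before being swapped for its id, which would misbehave for tokens containing metacharacters such as ( or *. A sketch of how the same swap could cover every occurrence with a literal replacement; it reuses the MetaRules fields but is an illustration, not the committed implementation:

// Sketch: replace every occurrence of each meta-token style with a unique id,
// treating the matched text literally rather than as a pattern.
for token_style in self.token_rules.iter() {
    let token_pattern =
        Regex::new(token_style.1.as_str()).expect("Could not compile pattern.");
    // Assumes the replacement removes the match, so the loop terminates
    // (true for the string and character patterns above).
    while let Some(found) = token_pattern.find(processed_code.as_str()) {
        let meta_id = format!("meta_token_{}__{}", found.start(), found.end());
        let meta_value = found.as_str().to_string();
        // Literal, single replacement instead of a regex replacement.
        processed_code =
            processed_code.replacen(meta_value.as_str(), meta_id.as_str(), 1);
        self.special_tokens.push((
            meta_id,
            Token {
                token: meta_value,
                token_type: crate::TokenType::IDENTIFIER,
            },
        ));
    }
}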
76
src/syntax.rs Normal file
View File

@@ -0,0 +1,76 @@
use toml::{Table, Value};
// SyntaxRule
// Implementation of a syntax rule that can be applied.
#[derive(Debug)]
pub struct SyntaxRule {
pub name: String,
pub left: String,
pub right: String,
}
// Implementation of SyntaxRule
// Load and Resolve from outside
impl SyntaxRule {
// @name new
// @return SyntaxRule
// @brief Create a new syntax rule.
// @param name_: String, left_: String, right_: String
fn new(name_: String, left_: String, right_: String) -> SyntaxRule {
SyntaxRule {
name: name_,
left: left_,
right: right_,
}
}
// @name load
// @return Vec<SyntaxRule>
// @brief Load configuration and retrieve transformation rules.
// @param configuration_filename: &str
pub fn load(configuration_filename: &str) -> Vec<SyntaxRule> {
let mut rules: Vec<SyntaxRule> = vec![];
let configuration_content: String = std::fs::read_to_string(configuration_filename)
.expect("[ERROR] Could not open configuration file!");
let configuration = gtoml::parse(configuration_content.as_str())
.expect("[ERROR] TOML invalid in preprocessor!");
let configuration_unpacked: Table = Table::try_from(configuration).unwrap();
let syntax_definitions: Table = match configuration_unpacked.get("syntax") {
Some(config) => config.as_table().unwrap().clone(),
None => Table::new(),
};
for key in syntax_definitions.keys() {
let rule: Value = syntax_definitions.get(key).unwrap().clone();
if rule.is_array() {
let rule_array = rule.as_array().unwrap();
let left: String = rule_array[0].as_str().unwrap_or("").to_string();
let right: String = rule_array[1].as_str().unwrap_or("").to_string();
rules.push(SyntaxRule {
name: key.to_string(),
left: left,
right: right,
});
}
}
rules
}
// @name resolve
// @return String
// @brief Applies all rules until none of them can be applied again.
// @param rules: Vec<SyntaxRule>, unsolved: String
pub fn resolve(rules: Vec<SyntaxRule>, unsolved: String) -> String {
String::new()
}
// @name transform
// @return String
// @brief Applies a rule.
// @param &mut self, unformed: String
fn transform(&mut self, unformed: String) -> String {
String::new()
}
}

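SyntaxRule::load above only keeps [syntax] entries that are two-element arrays (left pattern, right replacement); plain string entries such as definition or parameter are skipped for now, and resolve and transform are still stubs. A small usage sketch under those constraints; the apply_syntax function and the ./language.toml path are illustrative:

// Hypothetical caller: load the transformation rules and hand them to resolve,
// which currently returns an empty string.
fn apply_syntax(unsolved: String) -> String {
    let rules: Vec<crate::syntax::SyntaxRule> =
        crate::syntax::SyntaxRule::load("./language.toml");
    for rule in rules.iter() {
        println!("[INFO] Loaded syntax rule {}: {} => {}", rule.name, rule.left, rule.right);
    }
    crate::syntax::SyntaxRule::resolve(rules, unsolved)
}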
View File

@@ -1,9 +1,138 @@
#[cfg(test)]
mod tests {
use super::*;
// preprocessor
#[test]
fn test_replacements() {
let mut ruleset: crate::preprocessor::MetaRules =
crate::preprocessor::MetaRules::new("./testspecs.toml");
let sut: String = ruleset.process(String::from("-- Comment to remove"));
let verify: String = String::from("");
let case_comment_at_end: String =
ruleset.process(String::from("This -- comment is not removed."));
let case_comment_at_end_verify: String = String::from("This -- comment is not removed.");
assert_eq!(sut, verify);
assert_eq!(case_comment_at_end, case_comment_at_end_verify);
}
#[test]
fn dummy_test() {
assert_eq!(2, 2);
fn test_interpolation() {
let mut ruleset: crate::preprocessor::MetaRules =
crate::preprocessor::MetaRules::new("./testspecs.toml");
let run_with_interpolation_test: String = ruleset.process(String::from("#test"));
let interpolation_verification: String = std::fs::read_to_string("./mathlib.mlc").unwrap();
assert_eq!(run_with_interpolation_test, interpolation_verification);
}
#[test]
fn test_meta_token() {
let mut ruleset: crate::preprocessor::MetaRules =
crate::preprocessor::MetaRules::new("./testspecs.toml");
let meta_token_test_string: String = ruleset.process(String::from("\"sample\""));
let meta_token_sample_string: String = String::from("\"sample\"");
let meta_token_verify: Vec<crate::tokenizer::Token> = vec![crate::tokenizer::Token {
token: meta_token_sample_string,
token_type: crate::tokenizer::TokenType::IDENTIFIER,
}];
assert_eq!(meta_token_verify.len(), ruleset.special_tokens.len());
assert_eq!(
meta_token_verify[0].token,
ruleset.special_tokens[0].1.token
);
assert_eq!(meta_token_test_string, "meta_token_0__8");
}
// Tokenizer
#[test]
fn test_eat() {
let mut sample: crate::tokenizer::Tokenizer = crate::tokenizer::Tokenizer::new();
sample.read_configuration_from_file("./testspecs.toml");
sample.eat("faculty : Natural n := if n = 0 then 1 else n * faculty (n - 1);");
assert_eq!(
sample.token_list,
vec![
"faculty", ":", "Natural", "n", ":=", "if", "n", "=", "0", "then", "1", "else",
"n", "*", "faculty", "(", "n", "-", "1", ")", ";"
]
)
}
#[test]
fn test_identify_tokens() {
let mut token_sample: crate::tokenizer::Tokenizer = crate::tokenizer::Tokenizer::new();
token_sample.read_configuration_from_file("./testspecs.toml");
token_sample.eat("id : -> 125;");
token_sample.identify_tokens();
let mut token_verify: crate::tokenizer::Tokenizer = crate::tokenizer::Tokenizer::new();
token_verify.read_configuration_from_file("./testspecs.toml");
token_verify.eat("id : -> 125;");
token_verify.tokens = vec![
crate::tokenizer::Token {
token: String::from("id"),
token_type: crate::tokenizer::TokenType::IDENTIFIER,
},
crate::tokenizer::Token {
token: String::from(":"),
token_type: crate::tokenizer::TokenType::OPERAND,
},
crate::tokenizer::Token {
token: String::from("->"),
token_type: crate::tokenizer::TokenType::OPERAND,
},
crate::tokenizer::Token {
token: String::from("125"),
token_type: crate::tokenizer::TokenType::IDENTIFIER,
},
crate::tokenizer::Token {
token: String::from(";"),
token_type: crate::tokenizer::TokenType::TERMINATOR,
},
];
assert_eq!(token_sample.configuration, token_verify.configuration);
assert_eq!(token_sample.tokens.len(), token_verify.tokens.len());
assert_eq!(token_sample.token_list.len(), token_verify.token_list.len());
}
// @name test_syntax_load
// @return
// @brief Check that all syntax rules from testspecs.toml are loaded.
// @param
#[test]
fn test_syntax_load() {
let test: Vec<crate::syntax::SyntaxRule> =
crate::syntax::SyntaxRule::load("./testspecs.toml");
let verify: Vec<crate::syntax::SyntaxRule> = vec![
crate::syntax::SyntaxRule {
name: String::from("replace_predef"),
left: String::from(
"IDENTIFIER#1 -> OTHER := OTHER#2 TERMINATOR OTHER IDENTIFIER#1",
),
right: String::from("#1 -> OTHER := #2 TERMINATOR OTHER (#2)"),
},
crate::syntax::SyntaxRule {
name: String::from("replace_postdef"),
left: String::from(
"IDENTIFIER#1 OTHER TERMINATOR IDENTIFIER#1 -> OTHER := OTHER#2 TERMINATOR",
),
right: String::from("#2 OTHER TERMINATOR #1 -> OTHER := #2 TERMINATOR"),
},
crate::syntax::SyntaxRule {
name: String::from("unfold_parameter"),
left: String::from(": OTHER IDENTIFIER#1 ( IDENTIFIER#2 OTHER#3 ) OTHER ->"),
right: String::from(": OTHER #1 #2 #1 ( #3 ) OTHER ->"),
},
crate::syntax::SyntaxRule {
name: String::from("unfold_parameter_remove_brackets"),
left: String::from(": OTHER IDENTIFIER ( ) OTHER ->"),
right: String::from(": OTHER OTHER ->"),
},
];
assert_eq!(test.len(), verify.len());
}
}

View File

@@ -1,7 +1,7 @@
use std::fs;
use toml::{Table, Value};
#[derive(PartialEq)]
#[derive(PartialEq, Debug)]
pub enum TokenType {
OPERAND,
TERMINATOR,
@@ -21,10 +21,27 @@ pub struct Tokenizer {
// Token
// This is a token with a token type.
#[derive(Debug)]
pub struct Token {
pub token: String,
pub token_type: TokenType,
}
impl Clone for Token {
fn clone(&self) -> Token {
let token_type: TokenType = match self.token_type {
TokenType::OPERAND => TokenType::OPERAND,
TokenType::KEYWORD => TokenType::KEYWORD,
TokenType::TERMINATOR => TokenType::TERMINATOR,
TokenType::IDENTIFIER => TokenType::IDENTIFIER,
};
Token {
token: self.token.clone(),
token_type: token_type,
}
}
}
// Implementation of Tokenizer
// Functions associated with the tokenizer struct and module.
impl Tokenizer {
@@ -212,6 +229,15 @@ impl Tokenizer {
}
}
self.token_list.append(&mut new_tokens);
// Clean up token list
let mut cleaned_token_list: Vec<String> = vec![];
for token in self.token_list.iter() {
if token.as_str() != "" {
cleaned_token_list.push(token.to_string());
}
}
self.token_list = cleaned_token_list;
}
// @name identify_tokens

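The cleanup above drops empty strings from the token list by building a second vector. An equivalent in-place variant, as a sketch, would be Vec::retain, assuming nothing reads the intermediate list:

// Same effect as the cleanup loop above, in place:
self.token_list.retain(|token| !token.is_empty());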
79
testspecs.toml Normal file
View File

@@ -0,0 +1,79 @@
# Meta rules are separate rules with priority over all other rules.
# They can be compared to preprocessor directives, but are more powerful.
# Pattern matching in preprocessor style runs at the highest priority, before anything else.
[meta.replacements]
comments = ["^--.*", ""]
# Interpolation with a shell replaces the meta pattern with the interpolation result.
# Arguments are passed through capture groups, referenced as $<group number> in the shell command.
[meta.interpolation]
with = ["^#with ([\\w./]+)", "cat $1"]
date = ["#date_now", "date"]
user = ["#user", "user"]
test = ["#test", "cat ./mathlib.mlc"]
# Describes tokens to be replaced by identifiers and then later swapped back in after the tokenizer.
# All special tokens are treated as constants
[meta.token]
string_constant = "\".*?\""
char_constant = "'.'"
# Every key below is used as a token type (an enum variant) to sort the tokens
# -> Replacement happens in order
# -> Any remaining run of symbols is stored as a value
# -> Those values use the default type "identifier"
[token]
separator = [" ", ",", "\n"]
operands = [":=", "->", "<=", ">=", "<", ">", "!", "+", "-", "/", "*", "(", ")", "[", "]", "{", "}", "=", "?", ":"]
terminator = [";"]
[semantics]
keywords = ["if", "then", "else", "end"]
[constants]
number = "(?:0b[01]+|0x[0-9a-fA-F]+|0[0-7]+|[1-9][0-9]*)"
character = "'.'"
logic = "(true|false)"
[types]
Number = "number"
Character = "character"
Type = ""
Array = "{character * number}"
Logic = "logic"
# List of rules
# Rules can be found in traces
# Use better names than rule_1, rule_2, ...
# The compiler will run through all rules, trying to match exactly one.
# Uses the following generic types:
# - OPERAND
# - IDENTIFIER
# - KEYWORD
# - TERMINATOR
# - OTHER (Use this type for ambiguous parts. Same as lazy .+ in regular expressions)
# Defining custom types is possible by creating a rule with the same name.
# IMPORTANT: Rules are always top priority and can overwrite other types.
# Named placeholders: The character # is reserved for named placeholders. They are only valid inside a rule.
[syntax]
definition = "IDENTIFIER#1 -> IDENTIFIER#2 := OTHER#3 TERMINATOR"
definition_with_parameter = "IDENTIFIER#1 : parameter#2 -> IDENTIFIER#3 := OTHER#4 TERMINATOR"
recursion = "#basename OTHER := OTHER #basename OTHER TERMINATOR"
replace_predef = [ "IDENTIFIER#1 -> OTHER := OTHER#2 TERMINATOR OTHER IDENTIFIER#1", "#1 -> OTHER := #2 TERMINATOR OTHER (#2)" ]
replace_postdef = [ "IDENTIFIER#1 OTHER TERMINATOR IDENTIFIER#1 -> OTHER := OTHER#2 TERMINATOR", "#2 OTHER TERMINATOR #1 -> OTHER := #2 TERMINATOR" ]
unfold_parameter = [ ": OTHER IDENTIFIER#1 ( IDENTIFIER#2 OTHER#3 ) OTHER ->", ": OTHER #1 #2 #1 ( #3 ) OTHER ->" ]
unfold_parameter_remove_brackets = [ ": OTHER IDENTIFIER ( ) OTHER ->", ": OTHER OTHER ->" ]
parameter = ": OTHER ->"
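# Illustrative reading (the resolver is not implemented yet): with the
# definition rule above, a token sequence like
#   hello_word -> String := "Hello World!" ;
# would bind #1 = hello_word, #2 = String, #3 = the string constant, and the
# trailing ; would match TERMINATOR.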
# The following sections are used to build different output formats
# [interpreter] refers to the builtin interpreter using a minimal subset of C syntax
# The name of each section is only used to specify the actual output.
[clang]
definition = "#2 #1 () {return (#3);}"
Logic = "int"
Number = "long int"
Character = "char"
Type = "struct"
[interpreter]