Compare commits

...

5 Commits

Author SHA1 Message Date
2a846a5f53 Load configuration for in language syntax replacements 2025-08-26 22:43:59 +02:00
f67c79c65b Add testbench 2025-08-25 12:09:54 +02:00
42fa5affb5 Reintroducing meta tokens 2025-08-25 07:12:22 +02:00
015de5dc0a implement meta token replacement 2025-08-24 20:54:20 +02:00
ddba3423df Transformation 2025-08-12 19:04:09 +02:00
9 changed files with 408 additions and 28 deletions

View File

@@ -1,5 +1,3 @@
#with language.toml
variable:=-3; c := (a+b- 3) * 23 + variable; d := c - a;Natural : Number (n) := {n >= 0};faculty : Natural (n) -> Natural := if n = 0 then 1 else faculty (n-1) * n end;
String Natural (n) := {Character * n};hello_word -> String := "Hello World!";
first_letter -> Character := 'a';
wrong -> Logic := false;date -> String := "#date_now";

View File

@@ -10,6 +10,8 @@ comments = ["^--.*", ""]
[meta.interpolation]
with = ["^#with ([\\w./]+)", "cat $1"]
date = ["#date_now", "date"]
user = ["#user", "user"]
test = ["#test", "cat ./mathlib.mlc"]
# Describes tokens to be replaced by identifiers and then later swapped back in after the tokenizer.
# All special tokens are treated as constants
@@ -27,7 +29,7 @@ operands = [":=", "->", "<=", ">=", "<", ">", "!", "+", "-", "/", "*", "(", ")",
terminator = [";"]
[semantics]
keywords = ["if", "then", "else", "end", "with"]
keywords = ["if", "then", "else", "end"]
[constants]
number = "(?:0b[01]+|0x[0-9a-fA-F]+|0[0-7]+|[1-9][0-9]*)"

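The interpolation rules above pair a regex (whose capture groups are referenced as $1, $2, ... in the second element) with a shell command whose output replaces the matched directive. A minimal sketch of that expansion for a single rule, using the same regex and std::process APIs the preprocessor relies on; the expand_interpolation helper and its simplified invocation (no echo wrapper) are illustrative only, not part of this changeset:

use regex::Regex;
use std::process::Command;

// Hypothetical helper: expand the first directive matching one interpolation
// rule of the form [pattern, shell_command], e.g. ["^#with ([\\w./]+)", "cat $1"].
fn expand_interpolation(source: &str, pattern: &str, shell_command: &str) -> String {
    let rule = Regex::new(pattern).expect("invalid interpolation pattern");
    // Take the first directive that matches, if any.
    let Some(directive) = rule.find(source) else {
        return source.to_string();
    };
    // Substitute capture groups ($1, $2, ...) into the shell command.
    let command = rule.replace(directive.as_str(), shell_command).to_string();
    // Run the command and splice its stdout in place of the directive.
    let output = Command::new("/bin/bash")
        .arg("-c")
        .arg(&command)
        .output()
        .expect("interpolation command failed");
    let stdout = String::from_utf8_lossy(&output.stdout);
    source.replacen(directive.as_str(), stdout.trim_end(), 1)
}

With the date rule, a "#date_now" directive would come back replaced by the output of date, which is what the example.mlc line date -> String := "#date_now"; relies on.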
4
mathlib.mlc Normal file
View File

@@ -0,0 +1,4 @@
Sigma -> Array := {0, 1, 2};
N -> Array := {3};
P -> Array := {3 -> 012};
S -> Number := 3;

View File

@@ -1,26 +1,52 @@
mod identification;
mod preprocessor;
mod syntax;
mod testcases;
mod tokenizer;
use tokenizer::*;
fn main() {
let sample_code: String = std::fs::read_to_string("example.mlc").unwrap();
let mut example_tokenizer: Tokenizer = Tokenizer::new();
// CL-Wrapper
let args: Vec<String> = std::env::args().collect();
// Adjust to the following principle:
// micro [-t <target>] [-l <language.toml>] [<list of source files>]
// -t default: first found
// -l default: language.toml
//
// Either loads all source files or takes stdin input by piping code into the program
let mut raw_source_code: String = String::from("");
for i in 1..args.len() {
raw_source_code = raw_source_code
+ std::fs::read_to_string(args[i].clone())
.expect("Source file not found!")
.as_str();
}
// Load language toml
let mut meta_rules: crate::preprocessor::MetaRules =
crate::preprocessor::MetaRules::new("./language.toml");
let processed_sample_code: String = meta_rules.process(sample_code.to_owned());
example_tokenizer.read_configuration_from_file("./language.toml");
example_tokenizer.eat(processed_sample_code.as_str());
example_tokenizer.identify_tokens();
let mut tokenizer_configuration: Tokenizer = Tokenizer::new();
tokenizer_configuration.read_configuration_from_file("./language.toml");
let mut example_identifier: identification::Identifier =
identification::Identifier::new(example_tokenizer.tokens);
example_identifier.load_criteria_from_configuration(example_tokenizer.configuration);
example_identifier.identify_identifiers();
// Run preprocessor
let preprocessed_source_code: String = meta_rules.process(raw_source_code);
for token in example_identifier.tokens.iter() {
print!("{}", token.token);
// Tokenizing
tokenizer_configuration.eat(preprocessed_source_code.as_str());
tokenizer_configuration.identify_tokens();
// Reintroducing meta_tokens
for meta_token in meta_rules.special_tokens.iter() {
// Go through all tokens
for i in 0..tokenizer_configuration.tokens.len() {
if meta_token.0 == tokenizer_configuration.tokens[i].token {
tokenizer_configuration.tokens[i] = meta_token.1.clone();
break;
}
}
}
// Syntax resolving
// Apply translation
}

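The comment block in main above sketches the planned invocation, micro [-t <target>] [-l <language.toml>] [<list of source files>], with the stated defaults. A minimal sketch of how that could be parsed with only the standard library; the parse_arguments helper and its return shape are assumptions for illustration, not code from this changeset:

// Hypothetical argument handling following the comment above:
// micro [-t <target>] [-l <language.toml>] [<list of source files>]
fn parse_arguments(args: &[String]) -> (Option<String>, String, Vec<String>) {
    let mut target: Option<String> = None; // -t, default: first found
    let mut language = String::from("language.toml"); // -l, default
    let mut sources: Vec<String> = vec![];
    let mut i = 1;
    while i < args.len() {
        match args[i].as_str() {
            "-t" => {
                target = args.get(i + 1).cloned();
                i += 2;
            }
            "-l" => {
                if let Some(path) = args.get(i + 1) {
                    language = path.clone();
                }
                i += 2;
            }
            source_file => {
                sources.push(source_file.to_string());
                i += 1;
            }
        }
    }
    (target, language, sources)
}

When sources stays empty, the piping case mentioned in the comment could be covered by reading standard input instead, for example with std::io::read_to_string(std::io::stdin()).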
View File

@@ -1,5 +1,5 @@
use crate::tokenizer::Token;
use regex::{Captures, Regex};
use regex::{Captures, Match, Regex};
use toml::{Table, Value};
// MetaRules
@@ -8,7 +8,7 @@ pub struct MetaRules {
replacement_rules: Vec<(String, (String, String))>,
interpolation_rules: Vec<(String, (String, String))>,
token_rules: Vec<(String, String)>,
special_tokens: Vec<Token>,
pub special_tokens: Vec<(String, Token)>,
}
// Implementation of MetaRules
@@ -24,7 +24,7 @@ impl MetaRules {
let mut replacements: Vec<(String, (String, String))> = vec![];
let mut interpolation: Vec<(String, (String, String))> = vec![];
let mut meta_token_rules: Vec<(String, String)> = vec![];
let meta_tokens: Vec<Token> = vec![];
let meta_tokens: Vec<(String, Token)> = vec![];
let configuration = gtoml::parse(configuration_content.as_str())
.expect("[ERROR] TOML invalid in preprocessor!");
let configuration_unpacked: Table = Table::try_from(configuration).unwrap();
@@ -125,11 +125,15 @@ impl MetaRules {
println!("[INFO] Applying rule {}", rule.0);
let base_pattern: Regex = Regex::new((rule.1 .0).as_str()).unwrap();
let processed_code_replacement = processed_code.clone();
let parameter = &base_pattern
.captures(processed_code_replacement.as_str())
.unwrap()[0];
let command: &str = &base_pattern.replace(parameter, rule.1 .1.as_str());
println!("{:?}", &command);
let captures: Option<Captures> =
base_pattern.captures(processed_code_replacement.as_str());
let directive: String;
match captures {
Some(n) => directive = n.get(0).map_or("", |m| m.as_str()).to_string(),
None => continue,
};
let command: &str = &base_pattern.replace(directive.as_str(), rule.1 .1.as_str());
let subprocess = std::process::Command::new("/bin/bash")
.arg("-c")
.arg(String::from("echo \"$(") + command + ")\"")
@@ -143,6 +147,42 @@ impl MetaRules {
.to_string();
}
for token_style in self.token_rules.iter() {
println!("[INFO] Searching meta tokens of style {}", token_style.0);
// Find the first occurrence of this token style
let token_pattern: Regex =
Regex::new(token_style.1.as_str()).expect("Could not assign pattern.");
let match_list: Match;
match_list = match token_pattern.find(processed_code.as_str()) {
Some(n) => n,
None => continue,
};
// Create id for each occurrence
let meta_id: String = String::from("meta_token_")
+ match_list.start().to_string().as_str()
+ "__"
+ match_list.end().to_string().as_str();
// Replace token by id
let meta_value: String = match_list.as_str().to_string();
let value_regex: Regex =
Regex::new(meta_value.as_str()).expect("Could not create pattern.");
processed_code = value_regex
.replace(processed_code.as_str(), meta_id.as_str())
.to_string();
// Save id and token
self.special_tokens.push((
meta_id,
Token {
token: meta_value,
token_type: crate::TokenType::IDENTIFIER,
},
));
}
return processed_code;
}
}

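In the loop above, Regex::find only returns the first occurrence of each token style, and the matched text is itself recompiled as a regex before being swapped for its id, which would misbehave for tokens containing metacharacters such as ( or *. A sketch of how the same swap could cover every occurrence with a literal replacement; it reuses the MetaRules fields but is an illustration, not the committed implementation:

// Sketch: replace every occurrence of each meta-token style with a unique id,
// treating the matched text literally rather than as a pattern.
for token_style in self.token_rules.iter() {
    let token_pattern =
        Regex::new(token_style.1.as_str()).expect("Could not compile pattern.");
    // Assumes the replacement removes the match, so the loop terminates
    // (true for the string and character patterns above).
    while let Some(found) = token_pattern.find(processed_code.as_str()) {
        let meta_id = format!("meta_token_{}__{}", found.start(), found.end());
        let meta_value = found.as_str().to_string();
        // Literal, single replacement instead of a regex replacement.
        processed_code =
            processed_code.replacen(meta_value.as_str(), meta_id.as_str(), 1);
        self.special_tokens.push((
            meta_id,
            Token {
                token: meta_value,
                token_type: crate::TokenType::IDENTIFIER,
            },
        ));
    }
}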
76
src/syntax.rs Normal file
View File

@@ -0,0 +1,76 @@
use toml::{Table, Value};
// SyntaxRule
// Implementation of a syntax rule that can be applied.
#[derive(Debug)]
pub struct SyntaxRule {
pub name: String,
pub left: String,
pub right: String,
}
// Implementation of SyntaxRule
// Load and Resolve from outside
impl SyntaxRule {
// @name new
// @return SyntaxRule
// @brief Create a new syntax rule.
// @param name_: String, left_: String, right_: String
fn new(name_: String, left_: String, right_: String) -> SyntaxRule {
SyntaxRule {
name: name_,
left: left_,
right: right_,
}
}
// @name load
// @return Vec<SyntaxRule>
// @brief Load configuration and retrieve transformation rules.
// @param configuration_filename: &str
pub fn load(configuration_filename: &str) -> Vec<SyntaxRule> {
let mut rules: Vec<SyntaxRule> = vec![];
let configuration_content: String = std::fs::read_to_string(configuration_filename)
.expect("[ERROR] Could not open configuration file!");
let configuration = gtoml::parse(configuration_content.as_str())
.expect("[ERROR] TOML invalid in preprocessor!");
let configuration_unpacked: Table = Table::try_from(configuration).unwrap();
let syntax_definitions: Table = match configuration_unpacked.get("syntax") {
Some(config) => config.as_table().unwrap().clone(),
None => Table::new(),
};
for key in syntax_definitions.keys() {
let rule: Value = syntax_definitions.get(key).unwrap().clone();
if rule.is_array() {
let rule_array = rule.as_array().unwrap();
let left: String = rule_array[0].as_str().unwrap_or("").to_string();
let right: String = rule_array[1].as_str().unwrap_or("").to_string();
rules.push(SyntaxRule {
name: key.to_string(),
left: left,
right: right,
});
}
}
rules
}
// @name resolve
// @return String
// @brief Applies all rules until none of them can be applied again.
// @param rules: Vec<SyntaxRule>, unsolved: String
pub fn resolve(rules: Vec<SyntaxRule>, unsolved: String) -> String {
String::new()
}
// @name transform
// @return String
// @brief Applies a rule.
// @param &mut self, unformed: String
fn transform(&mut self, unformed: String) -> String {
String::new()
}
}

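SyntaxRule::load above only keeps [syntax] entries that are two-element arrays (left pattern, right replacement); plain string entries such as definition or parameter are skipped for now, and resolve and transform are still stubs. A small usage sketch under those constraints; the apply_syntax function and the ./language.toml path are illustrative:

// Hypothetical caller: load the transformation rules and hand them to resolve,
// which currently returns an empty string.
fn apply_syntax(unsolved: String) -> String {
    let rules: Vec<crate::syntax::SyntaxRule> =
        crate::syntax::SyntaxRule::load("./language.toml");
    for rule in rules.iter() {
        println!("[INFO] Loaded syntax rule {}: {} => {}", rule.name, rule.left, rule.right);
    }
    crate::syntax::SyntaxRule::resolve(rules, unsolved)
}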
View File

@@ -1,9 +1,138 @@
#[cfg(test)]
mod tests {
use super::*;
// preprocessor
#[test]
fn test_replacements() {
let mut ruleset: crate::preprocessor::MetaRules =
crate::preprocessor::MetaRules::new("./testspecs.toml");
let sut: String = ruleset.process(String::from("-- Comment to remove"));
let verify: String = String::from("");
let case_comment_at_end: String =
ruleset.process(String::from("This -- comment is not removed."));
let case_comment_at_end_verify: String = String::from("This -- comment is not removed.");
assert_eq!(sut, verify);
assert_eq!(case_comment_at_end, case_comment_at_end_verify);
}
#[test]
fn dummy_test() {
assert_eq!(2, 2);
fn test_interpolation() {
let mut ruleset: crate::preprocessor::MetaRules =
crate::preprocessor::MetaRules::new("./testspecs.toml");
let run_with_interpolation_test: String = ruleset.process(String::from("#test"));
let interpolation_verification: String = std::fs::read_to_string("./mathlib.mlc").unwrap();
assert_eq!(run_with_interpolation_test, interpolation_verification);
}
#[test]
fn test_meta_token() {
let mut ruleset: crate::preprocessor::MetaRules =
crate::preprocessor::MetaRules::new("./testspecs.toml");
let meta_token_test_string: String = ruleset.process(String::from("\"sample\""));
let meta_token_sample_string: String = String::from("\"sample\"");
let meta_token_verify: Vec<crate::tokenizer::Token> = vec![crate::tokenizer::Token {
token: meta_token_sample_string,
token_type: crate::tokenizer::TokenType::IDENTIFIER,
}];
assert_eq!(meta_token_verify.len(), ruleset.special_tokens.len());
assert_eq!(
meta_token_verify[0].token,
ruleset.special_tokens[0].1.token
);
assert_eq!(meta_token_test_string, "meta_token_0__8");
}
// Tokenizer
#[test]
fn test_eat() {
let mut sample: crate::tokenizer::Tokenizer = crate::tokenizer::Tokenizer::new();
sample.read_configuration_from_file("./testspecs.toml");
sample.eat("faculty : Natural n := if n = 0 then 1 else n * faculty (n - 1);");
assert_eq!(
sample.token_list,
vec![
"faculty", ":", "Natural", "n", ":=", "if", "n", "=", "0", "then", "1", "else",
"n", "*", "faculty", "(", "n", "-", "1", ")", ";"
]
)
}
#[test]
fn test_identify_tokens() {
let mut token_sample: crate::tokenizer::Tokenizer = crate::tokenizer::Tokenizer::new();
token_sample.read_configuration_from_file("./testspecs.toml");
token_sample.eat("id : -> 125;");
token_sample.identify_tokens();
let mut token_verify: crate::tokenizer::Tokenizer = crate::tokenizer::Tokenizer::new();
token_verify.read_configuration_from_file("./testspecs.toml");
token_verify.eat("id : -> 125;");
token_verify.tokens = vec![
crate::tokenizer::Token {
token: String::from("id"),
token_type: crate::tokenizer::TokenType::IDENTIFIER,
},
crate::tokenizer::Token {
token: String::from(":"),
token_type: crate::tokenizer::TokenType::OPERAND,
},
crate::tokenizer::Token {
token: String::from("->"),
token_type: crate::tokenizer::TokenType::OPERAND,
},
crate::tokenizer::Token {
token: String::from("125"),
token_type: crate::tokenizer::TokenType::IDENTIFIER,
},
crate::tokenizer::Token {
token: String::from(";"),
token_type: crate::tokenizer::TokenType::TERMINATOR,
},
];
assert_eq!(token_sample.configuration, token_verify.configuration);
assert_eq!(token_sample.tokens.len(), token_verify.tokens.len());
assert_eq!(token_sample.token_list.len(), token_verify.token_list.len());
}
// @name test_syntax_load
// @return
// @brief Check that all syntax rules from testspecs.toml are loaded.
// @param
#[test]
fn test_syntax_load() {
let test: Vec<crate::syntax::SyntaxRule> =
crate::syntax::SyntaxRule::load("./testspecs.toml");
let verify: Vec<crate::syntax::SyntaxRule> = vec![
crate::syntax::SyntaxRule {
name: String::from("replace_predef"),
left: String::from(
"IDENTIFIER#1 -> OTHER := OTHER#2 TERMINATOR OTHER IDENTIFIER#1",
),
right: String::from("#1 -> OTHER := #2 TERMINATOR OTHER (#2)"),
},
crate::syntax::SyntaxRule {
name: String::from("replace_postdef"),
left: String::from(
"IDENTIFIER#1 OTHER TERMINATOR IDENTIFIER#1 -> OTHER := OTHER#2 TERMINATOR",
),
right: String::from("#2 OTHER TERMINATOR #1 -> OTHER := #2 TERMINATOR"),
},
crate::syntax::SyntaxRule {
name: String::from("unfold_parameter"),
left: String::from(": OTHER IDENTIFIER#1 ( IDENTIFIER#2 OTHER#3 ) OTHER ->"),
right: String::from(": OTHER #1 #2 #1 ( #3 ) OTHER ->"),
},
crate::syntax::SyntaxRule {
name: String::from("unfold_parameter_remove_brackets"),
left: String::from(": OTHER IDENTIFIER ( ) OTHER ->"),
right: String::from(": OTHER OTHER ->"),
},
];
assert_eq!(test.len(), verify.len());
}
}

View File

@@ -1,7 +1,7 @@
use std::fs;
use toml::{Table, Value};
#[derive(PartialEq)]
#[derive(PartialEq, Debug)]
pub enum TokenType {
OPERAND,
TERMINATOR,
@@ -21,10 +21,27 @@ pub struct Tokenizer {
// Token
// This is a token with a token type.
#[derive(Debug)]
pub struct Token {
pub token: String,
pub token_type: TokenType,
}
impl Clone for Token {
fn clone(&self) -> Token {
let token_type: TokenType = match self.token_type {
TokenType::OPERAND => TokenType::OPERAND,
TokenType::KEYWORD => TokenType::KEYWORD,
TokenType::TERMINATOR => TokenType::TERMINATOR,
TokenType::IDENTIFIER => TokenType::IDENTIFIER,
};
Token {
token: self.token.clone(),
token_type: token_type,
}
}
}
// Implementation of Tokenizer
// Functions associated with the tokenizer struct and module.
impl Tokenizer {
@@ -212,6 +229,15 @@ impl Tokenizer {
}
}
self.token_list.append(&mut new_tokens);
// Clean up token list
let mut cleaned_token_list: Vec<String> = vec![];
for token in self.token_list.iter() {
if token.as_str() != "" {
cleaned_token_list.push(token.to_string());
}
}
self.token_list = cleaned_token_list;
}
// @name identify_tokens

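The cleanup above drops empty strings from the token list by building a second vector. An equivalent in-place variant, as a sketch, would be Vec::retain, assuming nothing reads the intermediate list:

// Same effect as the cleanup loop above, in place:
self.token_list.retain(|token| !token.is_empty());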
79
testspecs.toml Normal file
View File

@@ -0,0 +1,79 @@
# Meta rules are separate rules with priority over all other rules.
# They can be compared to preprocessor directives, but are more powerful.
# Pattern matching in preprocessor style runs at the highest priority, before anything else.
[meta.replacements]
comments = ["^--.*", ""]
# Interpolation with a shell replaces the meta pattern with the interpolation result.
# Arguments are passed through capture groups, referenced as $<group number> in the shell command.
[meta.interpolation]
with = ["^#with ([\\w./]+)", "cat $1"]
date = ["#date_now", "date"]
user = ["#user", "user"]
test = ["#test", "cat ./mathlib.mlc"]
# Describes tokens to be replaced by identifiers and then later swapped back in after the tokenizer.
# All special tokens are treated as constants
[meta.token]
string_constant = "\".*?\""
char_constant = "'.'"
# Every key below is used as a token type (an enum variant) to sort the tokens
# -> Replacement happens in order
# -> Any remaining run of symbols is stored as a value
# -> Those values use the default type "identifier"
[token]
separator = [" ", ",", "\n"]
operands = [":=", "->", "<=", ">=", "<", ">", "!", "+", "-", "/", "*", "(", ")", "[", "]", "{", "}", "=", "?", ":"]
terminator = [";"]
[semantics]
keywords = ["if", "then", "else", "end"]
[constants]
number = "(?:0b[01]+|0x[0-9a-fA-F]+|0[0-7]+|[1-9][0-9]*)"
character = "'.'"
logic = "(true|false)"
[types]
Number = "number"
Character = "character"
Type = ""
Array = "{character * number}"
Logic = "logic"
# List of rules
# Rules can be found in traces
# Use better names than rule_1, rule_2, ...
# The compiler will run through all rules, trying to match exactly one.
# Uses the following generic types:
# - OPERAND
# - IDENTIFIER
# - KEYWORD
# - TERMINATOR
# - OTHER (Use this type for ambiguous parts. Same as lazy .+ in regular expressions)
# Defining custom types is possible by creating a rule with the same name.
# IMPORTANT: Rules are always top priority and can overwrite other types.
# Named placeholders: The character # is reserved for named placeholders. They are only valid inside a rule.
[syntax]
definition = "IDENTIFIER#1 -> IDENTIFIER#2 := OTHER#3 TERMINATOR"
definition_with_parameter = "IDENTIFIER#1 : parameter#2 -> IDENTIFIER#3 := OTHER#4 TERMINATOR"
recursion = "#basename OTHER := OTHER #basename OTHER TERMINATOR"
replace_predef = [ "IDENTIFIER#1 -> OTHER := OTHER#2 TERMINATOR OTHER IDENTIFIER#1", "#1 -> OTHER := #2 TERMINATOR OTHER (#2)" ]
replace_postdef = [ "IDENTIFIER#1 OTHER TERMINATOR IDENTIFIER#1 -> OTHER := OTHER#2 TERMINATOR", "#2 OTHER TERMINATOR #1 -> OTHER := #2 TERMINATOR" ]
unfold_parameter = [ ": OTHER IDENTIFIER#1 ( IDENTIFIER#2 OTHER#3 ) OTHER ->", ": OTHER #1 #2 #1 ( #3 ) OTHER ->" ]
unfold_parameter_remove_brackets = [ ": OTHER IDENTIFIER ( ) OTHER ->", ": OTHER OTHER ->" ]
parameter = ": OTHER ->"
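# Illustrative reading (the resolver is not implemented yet): with the
# definition rule above, a token sequence like
#   hello_word -> String := "Hello World!" ;
# would bind #1 = hello_word, #2 = String, #3 = the string constant, and the
# trailing ; would match TERMINATOR.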
# The following sections are used to build different output formats
# [interpreter] refers to the builtin interpreter using a minimal subset of C syntax
# The name of each section is only used to specify the actual output.
[clang]
definition = "#2 #1 () {return (#3);}"
Logic = "int"
Number = "long int"
Character = "char"
Type = "struct"
[interpreter]