micro/src/tokenizer.rs

use std::fs;
use toml::{Table, Value};
#[derive(PartialEq)]
pub enum TokenType {
    OPERAND,
    TERMINATOR,
    IDENTIFIER,
    KEYWORD,
}
// Tokenizer
// Tokenizer and underlying functions to turn code into tokens
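// Typical flow (sketched in the test module at the bottom of this file):
// create a Tokenizer with new(), load a grammar with
// read_configuration_from_file(), feed source lines to eat(), then call
// identify_tokens() to classify everything in token_list.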
pub struct Tokenizer {
    pub token_list: Vec<String>,
    // BUG:
    pub tokens: Vec<Token>,
    // Grammar options from the TOML configuration file
    pub configuration: Table,
}
// Token
// This is a token with a token type.
pub struct Token {
    pub token: String,
    pub token_type: TokenType,
}
// Implementation of Tokenizer
// Functions associated with the tokenizer struct and module.
impl Tokenizer {
    // @name read_configuration_from_file
    // @return
    // @brief Try to read configuration from an external file
    // @param &mut self, configuration_filename: &str
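    // Example of the configuration layout this function expects (a sketch;
    // the concrete separator, operand, terminator and keyword values are
    // illustrative placeholders, not the project's real grammar):
    //
    //     [token]
    //     separator = [" "]
    //     operands = ["=", "+"]
    //     terminator = [";"]
    //
    //     [semantics]
    //     keywords = ["let"]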
    pub fn read_configuration_from_file(&mut self, configuration_filename: &str) {
        let configuration_string: String = fs::read_to_string(configuration_filename).expect(
            (String::from("Could not open configuration file at: ") + configuration_filename)
                .as_str(),
        );
        // Parse with the `toml` crate imported above
        self.configuration = configuration_string
            .parse::<Table>()
            .expect("TOML invalid!");
        // Check for token section in config, panic if not present
        if !self.configuration.contains_key("token") {
            panic!("Token section is not present!");
        }
        // Check for semantics section in config, panic if not present
        if !self.configuration.contains_key("semantics") {
            panic!("Semantics section is not present!");
        }
    }
    // @name new
    // @return Tokenizer
    // @brief Create a new Tokenizer
    // @param
    pub fn new() -> Tokenizer {
        let empty_tokens: Vec<Token> = vec![];
        let empty_value: toml::map::Map<String, Value> = toml::map::Map::new();
        let empty_token_list: Vec<String> = vec![];
        Tokenizer {
            tokens: empty_tokens,
            token_list: empty_token_list,
            configuration: empty_value,
        }
    }
    // @name eat
    // @return
    // @brief Consumes a string and saves the tokens
    // @param line: &str
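    // Example (illustrative, using the placeholder grammar sketched above):
    // with separator [" "], operands ["="] and terminator [";"],
    // eat("let x = 1;") appends the tokens "let", "x", "=", "1" and ";"
    // to token_list; the separators themselves are dropped.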
    pub fn eat(&mut self, line: &str) {
        // Get token vectors from configuration
        let token_table_value: &Value = self.configuration.get("token").unwrap();
        let token_table: Table = Table::try_from(token_table_value).unwrap();
        let mut tokens: Vec<String> = vec![line.to_string()];
        let mut new_tokens: Vec<String> = vec![];
        let mut token_buffer: String = String::from("");
        // Pass 1: split on separators and drop them.
        if token_table.contains_key("separator") {
            let separator: Vec<Value> = token_table
                .get_key_value("separator")
                .unwrap()
                .1
                .as_array()
                .unwrap()
                .clone();
            if !separator.is_empty() {
                for token in tokens.iter() {
                    let mut token_feed = token.clone();
                    while !token_feed.is_empty() {
                        let mut no_match: bool = true;
                        for sep in separator.iter() {
                            if token_feed.starts_with(sep.as_str().unwrap()) {
                                // Flush the buffered token and skip the separator
                                no_match = false;
                                if !token_buffer.is_empty() {
                                    new_tokens.push(token_buffer.clone());
                                    token_buffer = String::from("");
                                }
                                let new_feed: String =
                                    token_feed.split_off(sep.as_str().unwrap().len());
                                token_feed = new_feed;
                            }
                        }
                        if no_match {
                            // No separator matched: move one character into the buffer.
                            // NOTE: split_off(1) assumes the leading character is a
                            // single byte; a multi-byte UTF-8 character would panic here.
                            let new_feed: String = token_feed.split_off(1);
                            token_buffer = token_buffer
                                + String::from(token_feed.chars().next().unwrap()).as_str();
                            token_feed = new_feed;
                        }
                    }
                    // Flush whatever is left in the buffer as a token
                    new_tokens.push(token_buffer.clone());
                    token_buffer = String::from("");
                }
                // Flush the (now empty) buffer once more
                new_tokens.push(token_buffer.clone());
                token_buffer = String::from("");
            }
        }
        tokens = new_tokens.clone();
        new_tokens = vec![];
        // Pass 2: split out operands and keep them as tokens.
        if token_table.contains_key("operands") {
            let operands: Vec<Value> = token_table
                .get_key_value("operands")
                .unwrap()
                .1
                .as_array()
                .unwrap()
                .clone();
            if !operands.is_empty() {
                for token in tokens.iter() {
                    let mut token_feed = token.clone();
                    while !token_feed.is_empty() {
                        let mut no_match: bool = true;
                        for op in operands.iter() {
                            if token_feed.starts_with(op.as_str().unwrap()) {
                                // Flush the buffered token and emit the operand itself
                                no_match = false;
                                if !token_buffer.is_empty() {
                                    new_tokens.push(token_buffer.clone());
                                }
                                token_buffer = String::from("");
                                new_tokens.push(op.as_str().unwrap().to_string());
                                let new_feed: String =
                                    token_feed.split_off(op.as_str().unwrap().len());
                                token_feed = new_feed;
                            }
                        }
                        if no_match {
                            let new_feed: String = token_feed.split_off(1);
                            token_buffer = token_buffer
                                + String::from(token_feed.chars().next().unwrap()).as_str();
                            token_feed = new_feed;
                        }
                    }
                    // Flush whatever is left in the buffer as a token
                    new_tokens.push(token_buffer.clone());
                    token_buffer = String::from("");
                }
                // Flush the (now empty) buffer once more
                new_tokens.push(token_buffer.clone());
                token_buffer = String::from("");
            }
        }
        tokens = new_tokens.clone();
        new_tokens = vec![];
        // Pass 3: split out terminators and keep them as tokens.
        if token_table.contains_key("terminator") {
            let terminator: Vec<Value> = token_table
                .get_key_value("terminator")
                .unwrap()
                .1
                .as_array()
                .unwrap()
                .clone();
            if !terminator.is_empty() {
                for token in tokens.iter() {
                    let mut token_feed = token.clone();
                    while !token_feed.is_empty() {
                        let mut no_match: bool = true;
                        for term in terminator.iter() {
                            if token_feed.starts_with(term.as_str().unwrap()) {
                                // Flush the buffered token and emit the terminator itself
                                no_match = false;
                                if !token_buffer.is_empty() {
                                    new_tokens.push(token_buffer.clone());
                                }
                                token_buffer = String::from("");
                                new_tokens.push(term.as_str().unwrap().to_string());
                                let new_feed: String =
                                    token_feed.split_off(term.as_str().unwrap().len());
                                token_feed = new_feed;
                            }
                        }
                        if no_match {
                            let new_feed: String = token_feed.split_off(1);
                            token_buffer = token_buffer
                                + String::from(token_feed.chars().next().unwrap()).as_str();
                            token_feed = new_feed;
                        }
                    }
                    // Flush whatever is left in the buffer as the token ended
                    new_tokens.push(token_buffer.clone());
                    token_buffer = String::from("");
                }
                // Flush the (now empty) buffer once more
                new_tokens.push(token_buffer.clone());
            }
        }
        self.token_list.append(&mut new_tokens);
        // Clean up token list: drop the empty strings produced by the buffer flushes
        let mut cleaned_token_list: Vec<String> = vec![];
        for token in self.token_list.iter() {
            if token.as_str() != "" {
                cleaned_token_list.push(token.to_string());
            }
        }
        self.token_list = cleaned_token_list;
    }
    // @name identify_tokens
    // @return
    // @brief Go through all tokens and assign each one a TokenType.
    // @param &mut self
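    // Example (illustrative): with operands ["="], terminator [";"] and
    // keywords ["let"], the token "=" is classified as OPERAND, ";" as
    // TERMINATOR, "let" as KEYWORD, and anything unmatched (e.g. "x")
    // falls through to IDENTIFIER.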
    pub fn identify_tokens(&mut self) {
        // Go through the token list; the first matching category wins:
        // operand, then terminator, then keyword, otherwise identifier.
        let mut token_identities: Vec<Token> = vec![];
        let mut found_token: bool;
        let token_section: Table =
            Table::try_from(self.configuration.get("token").unwrap()).unwrap();
        let semantics_section: Table =
            Table::try_from(self.configuration.get("semantics").unwrap()).unwrap();
        for token in self.token_list.iter() {
            found_token = false;
            if token.as_str() == "" {
                continue;
            }
            // Check if token is an operand
            if token_section.contains_key("operands") {
                let operands: Vec<Value> = token_section
                    .get_key_value("operands")
                    .unwrap()
                    .1
                    .as_array()
                    .unwrap()
                    .clone();
                for operand in operands.iter() {
                    if operand.as_str().unwrap() == token.as_str() {
                        token_identities.push(Token {
                            token: token.clone(),
                            token_type: TokenType::OPERAND,
                        });
                        found_token = true;
                    }
                }
            }
            // Check if token is a terminator
            if token_section.contains_key("terminator") && !found_token {
                let terminator: Vec<Value> = token_section
                    .get_key_value("terminator")
                    .unwrap()
                    .1
                    .as_array()
                    .unwrap()
                    .clone();
                for term in terminator.iter() {
                    if term.as_str().unwrap() == token.as_str() {
                        token_identities.push(Token {
                            token: token.clone(),
                            token_type: TokenType::TERMINATOR,
                        });
                        found_token = true;
                    }
                }
            }
            // Check if token is a keyword
            if semantics_section.contains_key("keywords") && !found_token {
                let keywords: Vec<Value> = semantics_section
                    .get_key_value("keywords")
                    .unwrap()
                    .1
                    .as_array()
                    .unwrap()
                    .clone();
                for keyword in keywords.iter() {
                    if keyword.as_str().unwrap() == token.as_str() {
                        token_identities.push(Token {
                            token: token.clone(),
                            token_type: TokenType::KEYWORD,
                        });
                        found_token = true;
                    }
                }
            }
            // Anything unmatched is treated as an identifier
            if !found_token {
                token_identities.push(Token {
                    token: token.clone(),
                    token_type: TokenType::IDENTIFIER,
                });
            }
        }
        self.tokens = token_identities;
    }
}
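
// A minimal usage sketch (not part of the original file): it wires the pieces
// together with an inline configuration instead of calling
// read_configuration_from_file, and the grammar values below (" ", "=", ";",
// "let") are illustrative assumptions, not the project's real grammar.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn tokenizes_a_simple_line() {
        let mut tokenizer = Tokenizer::new();
        // Assumed example grammar, limited to the sections the tokenizer checks for
        tokenizer.configuration = r#"
            [token]
            separator = [" "]
            operands = ["="]
            terminator = [";"]

            [semantics]
            keywords = ["let"]
        "#
        .parse::<toml::Table>()
        .expect("example configuration should be valid TOML");

        tokenizer.eat("let x = 1;");
        tokenizer.identify_tokens();

        // The cleanup step drops the empty buffer flushes, so only real tokens remain
        assert_eq!(tokenizer.token_list, vec!["let", "x", "=", "1", ";"]);
        assert!(tokenizer
            .tokens
            .iter()
            .any(|t| t.token == "let" && t.token_type == TokenType::KEYWORD));
    }
}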