/* tokenizer.d */
module tokenizer;

import std.string;
import std.stdio;

/**
 * Lexical states of the tokenizer: between tokens, inside a string literal,
 * inside a `#\` character literal, or inside a name.
 *
 * tokenize() scans each token directly and does not need an explicit state
 * variable; the enum is retained so that any code referring to it by name
 * continues to compile.
 */
enum tokenizer_mode { NONE, STRING, CHAR, NAME }

/// Returns true if `c` is one of the characters in `set`.
private bool isOneOf(char c, string set)
{
    foreach (char member; set)
    {
        if (member == c)
            return true;
    }
    return false;
}

/// Returns true for ASCII whitespace: space, tab, vertical tab, CR, LF, form feed.
private bool isBlank(char c)
{
    return isOneOf(c, " \t\v\r\n\f");
}

/// Returns true if `c` ends a name or character literal: whitespace or a parenthesis.
private bool isDelimiter(char c)
{
    return isBlank(c) || c == '(' || c == ')';
}

/**
 * Finds the end of the string literal whose opening quote is data[start].
 *
 * Returns: the index just past the closing quote, or data.length if the
 * literal is unterminated.
 */
private size_t endOfString(string data, size_t start)
{
    size_t pos = start + 1; // step over the opening quote
    while (pos < data.length)
    {
        if (data[pos] == '\\' && pos + 1 < data.length)
        {
            // A backslash always travels with the character after it, so
            // both \" and \\ are consumed as a unit. Treating only \" as an
            // escape would misread "a\\" as an unterminated string.
            pos += 2;
        }
        else if (data[pos] == '"')
        {
            return pos + 1;
        }
        else
        {
            pos++;
        }
    }
    return data.length;
}

/**
 * Finds the end of the character literal whose `#\` prefix starts at
 * data[start].
 *
 * The character right after the prefix is always part of the literal, even
 * if it is a delimiter (so `#\(` and `#\ ` are single tokens); any further
 * characters are taken up to the next delimiter (e.g. `#\space`).
 */
private size_t endOfCharLiteral(string data, size_t start)
{
    size_t pos = start + 2; // step over "#\"
    if (pos < data.length)
        pos++; // unconditional first character
    while (pos < data.length && !isDelimiter(data[pos]))
        pos++;
    return pos;
}

/// Finds the end of the name starting at data[start]: the next delimiter or end of input.
private size_t endOfName(string data, size_t start)
{
    size_t pos = start + 1;
    while (pos < data.length && !isDelimiter(data[pos]))
        pos++;
    return pos;
}

/**
 * Splits Lisp-style source text into tokens.
 *
 * Token kinds, tried in this order at each token start:
 *   - whitespace is skipped;
 *   - `;` starts a comment that runs to the end of the line and is dropped;
 *   - `"` starts a string literal, returned with its quotes and with escape
 *     sequences left undecoded (an unterminated string is returned as-is);
 *   - `,@` is a two-character token;
 *   - each of the characters ` ' , ( ) is a one-character token;
 *   - `#\` starts a character literal;
 *   - anything else starts a name, which runs to the next whitespace or
 *     parenthesis. Note that `"`, `;` and quote characters do not end a name.
 *
 * Params:
 *   data = the source text to tokenize
 *
 * Returns: the tokens in input order; each token is a slice of `data`.
 */
string[] tokenize(string data)
{
    string[] tokens;
    size_t pos = 0;

    while (pos < data.length)
    {
        char c = data[pos];

        if (isBlank(c))
        {
            pos++;
            continue;
        }

        if (c == ';')
        {
            // Skip to the newline (or end of input); the newline itself is
            // then discarded as ordinary whitespace.
            while (pos < data.length && data[pos] != '\n')
                pos++;
            continue;
        }

        bool hasNext = pos + 1 < data.length;
        size_t end;

        if (c == '"')
            end = endOfString(data, pos);
        else if (c == ',' && hasNext && data[pos + 1] == '@')
            end = pos + 2;
        else if (isOneOf(c, "`',()"))
            end = pos + 1;
        else if (c == '#' && hasNext && data[pos + 1] == '\\')
            end = endOfCharLiteral(data, pos);
        else
            end = endOfName(data, pos);

        tokens ~= data[pos .. end];
        pos = end;
    }

    return tokens;
}

unittest
{
    writefln("unittest: tokenizer");
    string[] tokens;

    assert(tokenize("") == cast(string[]) []);
    assert(tokenize("x") == ["x"]);

    tokens = tokenize("(foo bar)");
    assert(tokens == ["(", "foo", "bar", ")"], join(tokens, " "));

    tokens = tokenize("(print \"a b c\")");
    assert(tokens == ["(", "print", "\"a b c\"", ")"], join(tokens, " "));

    tokens = tokenize(" \n(foo (bar (baz))) ");
    assert(tokens == ["(", "foo", "(", "bar", "(", "baz", ")", ")", ")"], join(tokens, " "));

    // quote, quasi-quote, unquote and unquote-splicing markers
    tokens = tokenize("'(a ,@b ,c `d)");
    assert(tokens == ["'", "(", "a", ",@", "b", ",", "c", "`", "d", ")"], join(tokens, " "));

    // comments run to end of line
    tokens = tokenize("(a ; ignored (\n b)");
    assert(tokens == ["(", "a", "b", ")"], join(tokens, " "));

    // character literals may contain a delimiter as their first character
    tokens = tokenize("(#\\( #\\space)");
    assert(tokens == ["(", "#\\(", "#\\space", ")"], join(tokens, " "));

    // escaped quote stays inside the string
    tokens = tokenize(`"a\"b" c`);
    assert(tokens == [`"a\"b"`, "c"], join(tokens, " "));

    // escaped backslash before the closing quote must not swallow the quote
    tokens = tokenize(`"a\\" b`);
    assert(tokens == [`"a\\"`, "b"], join(tokens, " "));

    // unterminated string is returned as a single token
    tokens = tokenize("\"abc");
    assert(tokens == ["\"abc"], join(tokens, " "));

    // every whitespace character separates names
    tokens = tokenize("a\fb\vc");
    assert(tokens == ["a", "b", "c"], join(tokens, " "));
}